[
  {
    "id": 1,
    "name": "Design Thinking (IDEO/Stanford d.school)",
    "name_zh": "设计思维（IDEO/斯坦福模型）",
    "slug": "design-thinking-ideo",
    "category": "thinking",
    "desc": "Human-centered 5-stage empathy-to-test design process",
    "desc_zh": "以人为本的五阶段从共情到测试的设计流程",
    "steps": [
      "Empathize: conduct user interviews, observations, and contextual inquiry to build deep user understanding",
      "Define: synthesize research into a problem statement (Point of View) and 'How Might We' questions",
      "Ideate: run divergent brainstorming sessions to generate a wide range of solution ideas without judgment",
      "Prototype: build low-fidelity tangible representations of your top ideas to make them testable",
      "Test: put prototypes in front of real users, observe reactions, gather feedback, and iterate"
    ],
    "steps_zh": [
      "共情：通过用户访谈、观察和情境调研，建立对用户的深度理解",
      "定义：将研究成果综合为问题陈述（POV）和「我们如何能……」式问题",
      "构思：开展发散式头脑风暴，不加评判地产生大量解决方案创意",
      "原型：为最优创意构建低保真有形原型，使其可被测试验证",
      "测试：将原型呈现给真实用户，观察反应，收集反馈并迭代改进"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Empathize",
      "Define",
      "Ideate",
      "Prototype",
      "Test"
    ],
    "viz_labels_zh": [
      "共情",
      "定义",
      "构思",
      "原型",
      "测试"
    ],
    "related": [
      "jobs-to-be-done",
      "problem-framing-how-now-wow",
      "human-ai-interaction-design"
    ],
    "tags": [
      "design",
      "empathy",
      "user-centered",
      "prototyping",
      "iteration"
    ],
    "origin_author": "David Kelley / Tim Brown (IDEO) & Hasso Plattner Institute of Design at Stanford, 2005",
    "origin_source": "Change by Design: How Design Thinking Transforms Organizations and Inspires Innovation",
    "origin_source_zh": "《设计改变一切：设计思维如何变革组织与激发创新》",
    "complexity": "beginner",
    "when_to_use": [
      "When your team is building a product for a user segment they don't deeply understand yet",
      "When an existing product has stagnating engagement and you need to rediscover unmet user needs",
      "When stakeholders disagree on what the real problem is and you need a structured way to align on user reality",
      "When entering a new market where assumptions about user behavior haven't been validated"
    ],
    "when_to_use_zh": [
      "当团队正在为尚未深入了解的用户群体构建产品时",
      "当现有产品的用户参与度停滞不前，需要重新发现未被满足的用户需求时",
      "当利益相关者对真正的问题存在分歧，需要一种结构化方式来统一对用户现实的认知时",
      "当进入一个用户行为假设尚未被验证的新市场时"
    ],
    "core_concepts": [
      "Empathy: Deeply understanding users' experiences, emotions, and motivations through immersive observation rather than assumptions",
      "Divergent-Convergent Rhythm: Alternating between expansive idea generation (diverge) and focused selection (converge) at each stage",
      "Rapid Prototyping: Building quick, cheap, tangible representations of ideas to learn through making rather than theorizing",
      "Iterative Learning: Treating every test as a learning opportunity that feeds back into earlier stages, not a pass/fail gate"
    ],
    "core_concepts_zh": [
      "共情：通过沉浸式观察而非假设，深入理解用户的体验、情感和动机",
      "发散-收敛节奏：在每个阶段交替进行扩展性创意生成（发散）和聚焦筛选（收敛）",
      "快速原型：构建快速、低成本、有形的创意表现形式，通过动手制作而非纯理论来学习",
      "迭代学习：将每次测试视为反馈到前序阶段的学习机会，而非通过/失败的关卡"
    ],
    "timeline": [
      [
        "1991",
        "David Kelley founds IDEO, embedding human-centered design into product development consulting"
      ],
      [
        "2005",
        "Stanford d.school (Hasso Plattner Institute of Design) launches, formalizing the 5-stage Design Thinking process"
      ],
      [
        "2008",
        "Tim Brown publishes 'Design Thinking' in Harvard Business Review, bringing the method to mainstream business audiences"
      ],
      [
        "2009",
        "Tim Brown's book 'Change by Design' is published, becoming the definitive reference for the framework"
      ],
      [
        "2015",
        "IBM adopts Enterprise Design Thinking at scale, training over 100,000 employees and validating the approach for large organizations"
      ]
    ],
    "timeline_zh": [
      [
        "1991",
        "David Kelley创立IDEO，将以人为本的设计融入产品开发咨询"
      ],
      [
        "2005",
        "斯坦福d.school（哈索·普拉特纳设计学院）成立，正式确立五阶段设计思维流程"
      ],
      [
        "2008",
        "Tim Brown在《哈佛商业评论》发表「设计思维」一文，将该方法推向主流商业视野"
      ],
      [
        "2009",
        "Tim Brown出版《设计改变一切》，成为该框架的权威参考书"
      ],
      [
        "2015",
        "IBM大规模采用企业级设计思维，培训超过10万名员工，验证了该方法在大型组织中的适用性"
      ]
    ],
    "dos": [
      "Do spend more time in the Empathize phase than feels comfortable, because premature problem definition leads to solving the wrong problem",
      "Do build prototypes that are intentionally rough, because polished prototypes discourage honest feedback from users",
      "Do include diverse team members (engineers, designers, business people) in every stage, because cross-functional perspectives prevent blind spots",
      "Do document insights on physical walls or shared boards, because spatial organization reveals patterns that lists cannot"
    ],
    "dos_zh": [
      "在共情阶段花比直觉告诉你的更多时间，因为过早定义问题会导致解决错误的问题",
      "故意将原型做得粗糙，因为精致的原型会阻碍用户给出诚实反馈",
      "在每个阶段都纳入多元团队成员（工程师、设计师、商务人员），因为跨职能视角能防止盲区",
      "将洞察记录在物理墙面或共享看板上，因为空间化组织能揭示列表无法展现的模式"
    ],
    "donts": [
      "Don't skip straight to ideation without user research, because you'll optimize for assumed needs rather than real ones",
      "Don't let the HiPPO (Highest Paid Person's Opinion) override user evidence, because Design Thinking is evidence-driven, not authority-driven",
      "Don't treat the 5 stages as a strictly linear waterfall, because real design work requires looping back to earlier stages as you learn",
      "Don't confuse Design Thinking workshops with sustained practice, because a one-off workshop without follow-through produces ideas that never ship"
    ],
    "donts_zh": [
      "不要跳过用户研究直接进入构思阶段，因为你会优化假设的需求而非真实的需求",
      "不要让「最高薪人士意见」压过用户证据，因为设计思维是证据驱动而非权威驱动",
      "不要将五个阶段视为严格线性的瀑布流程，因为真正的设计工作需要在学习中回溯到前序阶段",
      "不要将设计思维工作坊等同于持续实践，因为没有后续跟进的一次性工作坊只会产生永不落地的想法"
    ],
    "case_study_company": "Airbnb",
    "case_study": "In 2009, Airbnb was near bankruptcy with revenue flatlined at $200/week. Founders Joe Gebbia and Brian Chesky applied Design Thinking by flying to New York to visit hosts, empathizing with their struggles, and discovering that poor listing photos were killing trust. They prototyped a solution by personally re-photographing listings with professional cameras, which doubled weekly revenue almost immediately and became a turning point for the company.",
    "case_study_zh": "2009年，Airbnb濒临破产，每周收入仅200美元。创始人Joe Gebbia和Brian Chesky运用设计思维，亲赴纽约拜访房东，共情他们的困境，发现劣质房源照片严重损害了用户信任。他们亲自用专业相机为房源重新拍照作为原型验证，几乎立刻使每周收入翻倍，成为公司的关键转折点。",
    "case_study_challenge": "Airbnb was weeks from shutting down. Revenue had flatlined at $200 per week, and the founders could not figure out why hosts in New York — their biggest market — were failing to convert browsers into guests.",
    "case_study_challenge_zh": "Airbnb距离倒闭仅剩数周。每周收入停滞在200美元，创始人无法理解为何纽约这一最大市场的房东始终无法将浏览者转化为房客。",
    "case_study_approach": "Joe Gebbia and Brian Chesky flew to New York and spent time in hosts' apartments, observing their listings through a guest's eyes. They discovered the root cause: amateur photos made even beautiful spaces look untrustworthy. Rather than build a feature, they grabbed a rented camera and re-shot every listing themselves — a rapid, tangible prototype of the empathy insight.",
    "case_study_approach_zh": "Joe Gebbia和Brian Chesky飞赴纽约，在房东家中实地体验，以房客视角审视每一条房源。他们发现了根本原因：业余照片让再美的空间都显得不可信。他们没有去开发功能，而是租了一台相机，亲自为每条房源重新拍摄——这是对共情洞察最快速、最具体的原型验证。",
    "case_study_result": "Weekly revenue doubled almost immediately after the re-photographed listings went live. The insight became a permanent program — Airbnb's professional photography service — and marked the inflection point that carried the company from near-death to a $100 billion IPO.",
    "case_study_result_zh": "重新拍摄的房源上线后，每周收入几乎立刻翻倍。这一洞察演变为永久项目——Airbnb专业摄影服务——并成为公司从濒临倒闭走向千亿美元IPO的关键转折点。",
    "case_study_quote": "It was the turning point. We went from $200 a week to $400 a week. That was the moment we realized: get out of the building.",
    "case_study_quote_zh": "那是转折点。我们从每周200美元涨到了400美元。那一刻我们明白了：必须走出办公室。",
    "when_not_to_use": [
      "When the problem is well-understood and the solution is a known engineering optimization with clear metrics",
      "When operating under extreme time pressure (hours, not days) where rapid heuristic decisions are needed",
      "When the domain is highly regulated with predetermined requirements that leave no room for user-driven discovery",
      "When the team lacks access to real end users and cannot conduct meaningful empathy research"
    ],
    "when_not_to_use_zh": [
      "当问题已被充分理解且解决方案是有明确指标的已知工程优化时",
      "在极端时间压力下（几小时而非几天），需要快速启发式决策时",
      "当领域高度监管，预设需求不留用户驱动发现的空间时",
      "当团队无法接触真实终端用户，无法进行有意义的共情研究时"
    ],
    "adopters": [
      "IBM",
      "Google",
      "SAP",
      "PepsiCo",
      "Bank of America",
      "Intuit"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Tim Brown (2009). \"Change by Design: How Design Thinking Transforms Organizations and Inspires Innovation\". Harper Business.",
    "secondary_sources": [
      "Herbert A. Simon (1969). \"The Sciences of the Artificial\". MIT Press.",
      "Nigel Cross (2011). \"Design Thinking: Understanding How Designers Think and Work\". Berg Publishers."
    ],
    "typed_relations": [
      {
        "slug": "jobs-to-be-done",
        "type": "complement"
      },
      {
        "slug": "problem-framing-how-now-wow",
        "type": "complement"
      },
      {
        "slug": "human-ai-interaction-design",
        "type": "extends"
      }
    ]
  },
  {
    "id": 2,
    "name": "Domain-Driven Design (DDD)",
    "name_zh": "领域驱动设计",
    "slug": "domain-driven-design",
    "category": "thinking",
    "desc": "Model software around core business domain language and logic",
    "desc_zh": "围绕核心业务领域语言与逻辑构建软件模型",
    "steps": [
      "Knowledge Crunching: collaborate with domain experts to extract and refine the core domain model through iterative conversation",
      "Define the Ubiquitous Language: establish a shared, precise vocabulary used consistently in code, docs, and team discussion",
      "Identify Bounded Contexts: draw explicit boundaries around each subdomain where a particular model applies without ambiguity",
      "Model the Core Domain: design entities, value objects, aggregates, and domain events that reflect real business rules",
      "Map Context Relationships: define integration patterns (Anti-Corruption Layer, Shared Kernel, Open Host Service) between bounded contexts"
    ],
    "steps_zh": [
      "知识提炼：与领域专家协作，通过反复对话提取并精化核心领域模型",
      "定义统一语言：建立在代码、文档和团队讨论中统一使用的精确词汇表",
      "识别限界上下文：为每个子领域划定明确边界，确保模型在边界内无歧义",
      "建模核心领域：设计能够反映真实业务规则的实体、值对象、聚合与领域事件",
      "映射上下文关系：定义限界上下文之间的集成模式（防腐层、共享内核、开放主机服务等）"
    ],
    "ai_relevant": true,
    "viz_type": "venn",
    "viz_labels": [
      "Ubiquitous Language",
      "Bounded Context",
      "Context Map"
    ],
    "viz_labels_zh": [
      "统一语言",
      "限界上下文",
      "上下文映射"
    ],
    "related": [
      "ddd-tactical-patterns",
      "cqrs-pattern",
      "event-sourcing-pattern"
    ],
    "tags": [
      "domain-modeling",
      "bounded-context",
      "ubiquitous-language",
      "strategic-design"
    ],
    "origin_author": "Eric Evans, 2003",
    "origin_source": "Domain-Driven Design: Tackling Complexity in the Heart of Software",
    "origin_source_zh": "《领域驱动设计：软件核心复杂性应对之道》",
    "complexity": "advanced",
    "when_to_use": [
      "When your system's core complexity lies in the business rules rather than technical infrastructure",
      "When multiple teams need to work on the same large system and you need clear ownership boundaries",
      "When domain experts and developers consistently miscommunicate because they use different terminology",
      "When a legacy monolith needs to be decomposed into services along meaningful business boundaries"
    ],
    "when_to_use_zh": [
      "当系统的核心复杂性在于业务规则而非技术基础设施时",
      "当多个团队需要在同一大型系统上协作，需要明确的职责边界时",
      "当领域专家和开发者因使用不同术语而持续产生沟通障碍时",
      "当遗留单体应用需要沿有意义的业务边界拆分为服务时"
    ],
    "core_concepts": [
      "Ubiquitous Language: A shared vocabulary between developers and domain experts that is used consistently in code, conversation, and documentation to eliminate translation errors",
      "Bounded Context: An explicit boundary within which a particular domain model is defined and applicable, preventing one team's model from corrupting another's",
      "Aggregate: A cluster of domain objects treated as a single unit for data changes, enforcing consistency boundaries within a bounded context",
      "Context Mapping: The practice of explicitly documenting relationships and integration patterns between bounded contexts to manage dependencies"
    ],
    "core_concepts_zh": [
      "统一语言：开发者与领域专家之间的共享词汇表，在代码、对话和文档中一致使用，以消除翻译错误",
      "限界上下文：定义特定领域模型适用范围的显式边界，防止一个团队的模型污染另一个团队的模型",
      "聚合：作为数据变更单元处理的领域对象集群，在限界上下文内强制一致性边界",
      "上下文映射：显式记录限界上下文之间的关系和集成模式以管理依赖的实践"
    ],
    "timeline": [
      [
        "2003",
        "Eric Evans publishes 'Domain-Driven Design: Tackling Complexity in the Heart of Software', introducing DDD concepts"
      ],
      [
        "2006",
        "The DDD community begins forming around the Yahoo Groups mailing list and early conferences, spreading strategic design patterns"
      ],
      [
        "2013",
        "Vaughn Vernon publishes 'Implementing Domain-Driven Design', providing practical implementation guidance that accelerates adoption"
      ],
      [
        "2015",
        "Microservices architecture popularizes Bounded Context as the natural service decomposition boundary"
      ],
      [
        "2019",
        "Event Storming (by Alberto Brandolini) becomes the dominant collaborative DDD modeling technique in enterprise teams"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Eric Evans出版《领域驱动设计：软件核心复杂性应对之道》，引入DDD核心概念"
      ],
      [
        "2006",
        "DDD社区围绕Yahoo Groups邮件列表和早期会议逐步形成，传播战略设计模式"
      ],
      [
        "2013",
        "Vaughn Vernon出版《实现领域驱动设计》，提供实践指导，加速了DDD的采用"
      ],
      [
        "2015",
        "微服务架构的流行使限界上下文成为服务拆分的天然边界"
      ],
      [
        "2019",
        "Alberto Brandolini提出的事件风暴成为企业团队中最主流的协作式DDD建模技术"
      ]
    ],
    "dos": [
      "Do invest heavily in conversations with domain experts before writing code, because the model quality depends on how well you understand the domain",
      "Do name classes, methods, and variables using the Ubiquitous Language, because code that speaks the domain's language is self-documenting and catches modeling errors early",
      "Do keep aggregates small and focused on consistency boundaries, because large aggregates create contention and scalability problems",
      "Do draw context maps as living documents, because team and system boundaries evolve and stale maps cause integration surprises"
    ],
    "dos_zh": [
      "在编写代码前大量投入与领域专家的对话，因为模型质量取决于你对领域理解的深度",
      "使用统一语言命名类、方法和变量，因为使用领域语言的代码本身就是文档，并能尽早捕获建模错误",
      "保持聚合小而聚焦于一致性边界，因为过大的聚合会产生竞争和可扩展性问题",
      "将上下文映射图作为活文档维护，因为团队和系统边界会演化，过时的映射图会导致集成意外"
    ],
    "donts": [
      "Don't apply DDD tactical patterns (entities, value objects, repositories) without first doing strategic design, because patterns without bounded contexts create distributed monoliths",
      "Don't force every subdomain to use rich domain models, because supporting and generic subdomains often work fine with simpler CRUD approaches",
      "Don't let the database schema drive your domain model, because persistence is an infrastructure concern that should adapt to the model, not the reverse",
      "Don't create a single universal model shared across all teams, because different contexts genuinely need different representations of the same concept"
    ],
    "donts_zh": [
      "不要在没有先做战略设计的情况下直接使用DDD战术模式（实体、值对象、仓储），因为没有限界上下文的模式会创建分布式单体",
      "不要强制每个子领域都使用富领域模型，因为支撑域和通用域用简单的CRUD方式通常就能很好地工作",
      "不要让数据库模式驱动领域模型，因为持久化是基础设施关注点，应该适配模型而非反过来",
      "不要创建一个跨所有团队共享的单一通用模型，因为不同上下文确实需要对同一概念有不同的表示"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix applied DDD principles when decomposing their monolithic DVD rental system into microservices for streaming. They identified distinct bounded contexts such as content catalog, subscriber management, recommendation engine, and streaming delivery, each with its own ubiquitous language and team ownership. This decomposition enabled teams to deploy independently and scale each context according to its unique load patterns, supporting the company's growth from 20 million to over 200 million subscribers.",
    "case_study_zh": "Netflix在将单体DVD租赁系统拆分为流媒体微服务时应用了DDD原则。他们识别出内容目录、订阅者管理、推荐引擎和流媒体分发等不同的限界上下文，每个上下文都有自己的统一语言和团队所有权。这种拆分使各团队能独立部署并根据各上下文独特的负载模式进行扩展，支撑了公司从2000万增长到超过2亿订阅者。",
    "case_study_challenge": "Netflix's monolithic DVD rental system was buckling under the pivot to streaming. A single codebase owned by everyone meant that a change to the recommendation engine could break billing, and deployment cycles stretched to weeks as coordination overhead consumed the engineering organization.",
    "case_study_challenge_zh": "Netflix的单体DVD租赁系统在向流媒体转型中不堪重负。一个所有人共有的单一代码库意味着推荐引擎的改动可能破坏计费系统，而部署周期因协调成本拉长至数周。",
    "case_study_approach": "The team applied DDD to identify natural bounded contexts — content catalog, subscriber management, recommendation engine, streaming delivery — each with its own ubiquitous language and dedicated team ownership. Services communicated through explicit, well-defined interfaces documented in context maps rather than through shared databases.",
    "case_study_approach_zh": "团队运用DDD识别出天然的限界上下文——内容目录、订阅者管理、推荐引擎、流媒体分发——每个上下文拥有独立的统一语言和专属团队。服务间通过上下文映射中明确定义的接口通信，而非共享数据库。",
    "case_study_result": "Independent deployment and context-specific scaling enabled Netflix to grow from 20 million to over 200 million subscribers without architectural collapse. Each team could evolve its domain model at its own pace, and the bounded context boundaries proved remarkably stable even as the underlying technology changed.",
    "case_study_result_zh": "独立部署和按上下文弹性扩展使Netflix从2000万用户增长到超过2亿，架构依然稳固。每个团队能按自己的节奏演进领域模型，限界上下文的边界即使在底层技术更迭后也保持了惊人的稳定。",
    "case_study_quote": "The best thing we did was identify where one team's language ended and another's began. That's where the service boundaries drew themselves.",
    "case_study_quote_zh": "我们做过最正确的事，就是找到一个团队的语言在哪里结束、另一个团队的语言从哪里开始。服务边界就在那里自然浮现。",
    "when_not_to_use": [
      "When building a simple CRUD application with minimal business logic where the overhead of domain modeling isn't justified",
      "When the team has no access to domain experts and cannot establish a meaningful ubiquitous language",
      "When working on a short-lived prototype or proof of concept where speed of delivery matters more than model accuracy",
      "When the project is purely technical infrastructure (logging, monitoring, CI/CD) with no business domain to model"
    ],
    "when_not_to_use_zh": [
      "当构建业务逻辑极简的CRUD应用，领域建模的开销不合理时",
      "当团队无法接触领域专家，无法建立有意义的统一语言时",
      "当开发短期原型或概念验证，交付速度比模型准确性更重要时",
      "当项目是纯技术基础设施（日志、监控、CI/CD），没有业务领域需要建模时"
    ],
    "adopters": [
      "Netflix",
      "Spotify",
      "Zalando",
      "Thoughtworks",
      "VMware (Spring Team)",
      "Just Eat Takeaway"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Eric Evans (2003). \"Domain-Driven Design: Tackling Complexity in the Heart of Software\". Addison-Wesley.",
    "secondary_sources": [
      "Vaughn Vernon (2013). \"Implementing Domain-Driven Design\". Addison-Wesley.",
      "Eric Evans (2015). \"Domain-Driven Design Reference\". Domain Language, Inc."
    ],
    "typed_relations": [
      {
        "slug": "ddd-tactical-patterns",
        "type": "extends"
      },
      {
        "slug": "cqrs-pattern",
        "type": "complement"
      },
      {
        "slug": "event-sourcing-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 3,
    "name": "Systems Thinking",
    "name_zh": "系统思维",
    "slug": "systems-thinking",
    "category": "thinking",
    "desc": "Analyze software as interconnected feedback loops, not parts",
    "desc_zh": "将软件视为相互关联的反馈回路而非孤立部件进行分析",
    "steps": [
      "Identify the System Boundary: define what is inside vs. outside the system and where key interfaces lie",
      "Map Stocks and Flows: identify accumulations (data stores, queues, state) and the flows that increase or decrease them",
      "Trace Feedback Loops: find reinforcing loops (growth/decay) and balancing loops (stabilization) within the system",
      "Identify Leverage Points: locate places in the system where small changes produce large, lasting effects",
      "Simulate and Validate: build causal loop diagrams or simple simulations to test mental models against observed behavior"
    ],
    "steps_zh": [
      "界定系统边界：明确系统内外的范围以及关键接口所在位置",
      "绘制存量与流量：识别系统中的积累要素（数据存储、队列、状态）及其增减流量",
      "追踪反馈回路：发现系统内的增强回路（增长/衰减）和调节回路（稳定化）",
      "识别杠杆点：找到系统中以小改动产生持久大影响的关键位置",
      "模拟与验证：构建因果回路图或简单仿真，对照观察行为检验心智模型"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "System",
      "Feedback Loop",
      "Emergence",
      "Leverage Point",
      "Delay"
    ],
    "viz_labels_zh": [
      "系统",
      "反馈回路",
      "涌现",
      "杠杆点",
      "滞后效应"
    ],
    "related": [
      "cynefin-framework",
      "wardley-mapping",
      "conways-law"
    ],
    "tags": [
      "systems",
      "feedback-loops",
      "leverage-points",
      "holistic-analysis"
    ],
    "origin_author": "Donella Meadows / Jay Forrester (MIT), 1950s-1990s",
    "origin_source": "Thinking in Systems: A Primer (Donella Meadows, 2008)",
    "origin_source_zh": "《系统之美：决策者的系统思考》（多内拉·梅多斯，2008年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When a production system exhibits surprising emergent behavior that can't be explained by looking at individual components",
      "When fixing one problem in your architecture keeps creating new problems elsewhere (whack-a-mole symptoms)",
      "When scaling decisions have second-order effects on team structure, deployment, and operational costs",
      "When planning a platform migration and you need to understand cascading dependencies across services"
    ],
    "when_to_use_zh": [
      "当生产系统表现出无法通过审视单个组件来解释的意外涌现行为时",
      "当修复架构中的一个问题总是在别处引发新问题（打地鼠式症状）时",
      "当扩展决策对团队结构、部署和运营成本产生二阶效应时",
      "当规划平台迁移并需要理解跨服务的级联依赖时"
    ],
    "core_concepts": [
      "Feedback Loops: Reinforcing loops amplify change (growth or collapse) while balancing loops resist change (stabilization), and most system behavior emerges from their interaction",
      "Stocks and Flows: Stocks are accumulations (queues, caches, data stores) that change over time through inflows and outflows, and their delays often cause counterintuitive behavior",
      "Leverage Points: Places in the system where a small intervention produces disproportionately large effects, ranked from shallow (parameter tweaks) to deep (paradigm shifts)",
      "Emergence: System-level properties that arise from component interactions but cannot be predicted from studying components in isolation"
    ],
    "core_concepts_zh": [
      "反馈回路：增强回路放大变化（增长或崩溃），调节回路抵抗变化（稳定化），大多数系统行为源自二者的交互",
      "存量与流量：存量是随时间通过流入和流出而变化的积累（队列、缓存、数据存储），其延迟往往导致反直觉的行为",
      "杠杆点：系统中小干预能产生不成比例大效果的位置，从浅层（参数调整）到深层（范式转换）排列",
      "涌现：从组件交互中产生的系统级属性，无法通过孤立研究组件来预测"
    ],
    "timeline": [
      [
        "1956",
        "Jay Forrester at MIT develops System Dynamics, creating the mathematical foundation for modeling complex systems with feedback"
      ],
      [
        "1972",
        "The Club of Rome publishes 'The Limits to Growth', using system dynamics models to analyze global resource constraints, bringing systems thinking to public awareness"
      ],
      [
        "1990",
        "Peter Senge publishes 'The Fifth Discipline', introducing systems thinking as a core management practice for learning organizations"
      ],
      [
        "1997",
        "Donella Meadows publishes 'Leverage Points: Places to Intervene in a System', providing the influential 12-point hierarchy of system interventions"
      ],
      [
        "2008",
        "Meadows' posthumous book 'Thinking in Systems: A Primer' is published, becoming the most accessible introduction to systems thinking"
      ]
    ],
    "timeline_zh": [
      [
        "1956",
        "MIT的Jay Forrester开发系统动力学，为带反馈的复杂系统建模奠定数学基础"
      ],
      [
        "1972",
        "罗马俱乐部发表《增长的极限》，用系统动力学模型分析全球资源约束，将系统思维带入公众视野"
      ],
      [
        "1990",
        "Peter Senge出版《第五项修炼》，将系统思维引入为学习型组织的核心管理实践"
      ],
      [
        "1997",
        "Donella Meadows发表「杠杆点：系统中的干预位置」，提出影响深远的12级系统干预层次"
      ],
      [
        "2008",
        "Meadows遗作《系统之美》出版，成为系统思维最易读的入门书籍"
      ]
    ],
    "dos": [
      "Do draw causal loop diagrams before proposing solutions, because visualizing feedback structures reveals non-obvious dynamics",
      "Do look for delays in the system (queue backlogs, cache staleness, deployment pipelines), because delays are the most common source of system oscillation",
      "Do identify which loops are dominant under different conditions, because the same system behaves differently at different scales or loads",
      "Do consider the system boundary carefully, because drawing it too narrow misses critical external feedback and drawing it too wide makes analysis intractable"
    ],
    "dos_zh": [
      "在提出解决方案前先画因果回路图，因为可视化反馈结构能揭示不明显的动态特性",
      "寻找系统中的延迟（队列积压、缓存陈旧、部署管线），因为延迟是系统振荡最常见的根源",
      "识别在不同条件下哪些回路占主导，因为同一系统在不同规模或负载下表现不同",
      "仔细考虑系统边界，因为边界画得太窄会遗漏关键的外部反馈，太宽则使分析无法进行"
    ],
    "donts": [
      "Don't assume linear cause-and-effect in complex systems, because feedback loops create non-linear dynamics where effects can amplify or counteract their causes",
      "Don't optimize a single component without considering system-wide effects, because local optimization often degrades global performance",
      "Don't ignore time delays between actions and consequences, because delayed feedback is the primary reason well-intentioned changes produce opposite results",
      "Don't treat the system model as complete truth, because all models are simplifications and must be validated against observed behavior"
    ],
    "donts_zh": [
      "不要在复杂系统中假设线性因果关系，因为反馈回路会产生非线性动态，效果可能放大或抵消其原因",
      "不要在不考虑系统范围影响的情况下优化单个组件，因为局部优化常常降低全局性能",
      "不要忽略行动与结果之间的时间延迟，因为延迟反馈是善意变更产生相反结果的主要原因",
      "不要将系统模型视为完整的真相，因为所有模型都是简化，必须对照观测行为进行验证"
    ],
    "case_study_company": "Amazon",
    "case_study": "Amazon applied systems thinking to understand and amplify its marketplace flywheel: more sellers lead to more selection, which attracts more customers, which attracts more sellers (a reinforcing loop), while higher volume drives lower costs, enabling lower prices (a second reinforcing loop). By identifying these feedback loops, Amazon strategically invested in fulfillment infrastructure as the leverage point that accelerated both loops simultaneously, fueling decades of compounding growth.",
    "case_study_zh": "Amazon运用系统思维理解并放大其市场飞轮效应：更多卖家带来更多商品选择，吸引更多客户，继而吸引更多卖家（增强回路），同时更高的交易量降低成本，使价格更低（第二个增强回路）。通过识别这些反馈回路，Amazon战略性地投资履约基础设施作为杠杆点，同时加速两个回路，推动了数十年的复合增长。",
    "when_not_to_use": [
      "When the problem is genuinely linear and isolated with no significant feedback effects or cross-component dependencies",
      "When you need an immediate tactical fix and the system analysis would take longer than the fix itself",
      "When the team lacks the time or expertise to build and validate system models, risking analysis paralysis",
      "When the scope is a single well-bounded function or algorithm with no external interactions to model"
    ],
    "when_not_to_use_zh": [
      "当问题确实是线性且孤立的，没有显著的反馈效应或跨组件依赖时",
      "当需要即时战术修复，而系统分析所需时间比修复本身更长时",
      "当团队缺乏构建和验证系统模型的时间或专业能力，存在分析瘫痪风险时",
      "当范围是单个有明确边界的函数或算法，没有需要建模的外部交互时"
    ],
    "adopters": [
      "Amazon",
      "Toyota",
      "World Bank",
      "NASA",
      "Shell",
      "Intel"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Donella Meadows (2008). \"Thinking in Systems: A Primer\". Chelsea Green Publishing.",
    "secondary_sources": [
      "Jay W. Forrester (1961). \"Industrial Dynamics\". MIT Press.",
      "Peter Senge (1990). \"The Fifth Discipline: The Art & Practice of the Learning Organization\". Doubleday."
    ],
    "typed_relations": [
      {
        "slug": "cynefin-framework",
        "type": "complement"
      },
      {
        "slug": "wardley-mapping",
        "type": "complement"
      },
      {
        "slug": "conways-law",
        "type": "related"
      }
    ]
  },
  {
    "id": 4,
    "name": "Jobs-to-Be-Done (JTBD)",
    "name_zh": "待完成任务理论",
    "slug": "jobs-to-be-done",
    "category": "thinking",
    "desc": "Frame user needs as functional, social, emotional 'jobs'",
    "desc_zh": "将用户需求框架化为功能性、社会性、情感性「任务」",
    "steps": [
      "Identify the Job: define the core progress the user is trying to make in a specific circumstance (not the feature they want)",
      "Unpack Job Dimensions: separate the functional job (task), emotional job (feeling), and social job (perception by others)",
      "Write Job Statements: craft statements in the form 'When [situation], I want to [motivation], so I can [outcome]'",
      "Discover Struggling Moments: find where and why users struggle to get the job done with existing solutions",
      "Map Solution Fit: design or evaluate features by how well they help users hire your software to complete each job"
    ],
    "steps_zh": [
      "识别任务：定义用户在特定情境下试图实现的核心进展（而非他们想要的功能）",
      "拆解任务维度：区分功能性任务（要做什么）、情感性任务（感受如何）和社会性任务（他人如何看待）",
      "撰写任务陈述：采用「当[情境]时，我想要[动机]，以便[结果]」的格式撰写陈述",
      "发现挣扎时刻：找出用户在完成任务时与现有方案的挣扎点及其原因",
      "映射方案契合度：通过功能帮助用户「雇佣」软件完成任务的程度来设计或评估功能"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Functional Job",
      "Emotional Job",
      "Outcome",
      "Hire/Fire"
    ],
    "viz_labels_zh": [
      "功能性任务",
      "情感性任务",
      "预期结果",
      "雇佣/解雇"
    ],
    "related": [
      "design-thinking-ideo",
      "problem-framing-how-now-wow"
    ],
    "tags": [
      "user-needs",
      "product-strategy",
      "innovation",
      "customer-research"
    ],
    "origin_author": "Clayton Christensen / Tony Ulwick, 1990s-2003",
    "origin_source": "The Innovator's Solution: Creating and Sustaining Successful Growth (Clayton Christensen, 2003)",
    "origin_source_zh": "《创新者的解答：经济不确定期的创新指南》（克莱顿·克里斯滕森，2003年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When your product roadmap is driven by feature requests rather than understanding why users need those features",
      "When you're trying to understand why customers switch from a competitor's product to yours (or vice versa)",
      "When you need to identify innovation opportunities in an established market where demographic segmentation isn't revealing actionable insights",
      "When your team debates features but can't articulate what progress users are trying to make in their lives"
    ],
    "when_to_use_zh": [
      "当产品路线图由功能请求驱动，而非基于理解用户为何需要这些功能时",
      "当你试图理解客户为何从竞争对手的产品切换到你的产品（或反之）时",
      "当需要在成熟市场中识别创新机会，而人口统计细分未能揭示可操作洞察时",
      "当团队争论功能但无法清晰表达用户试图在生活中实现什么进展时"
    ],
    "core_concepts": [
      "The Job: The progress a person is trying to make in a particular circumstance — not the product they buy, but the underlying need the product is 'hired' to fulfill",
      "Hiring and Firing: Customers 'hire' products to do jobs and 'fire' them when they fail, providing a lens to understand adoption and churn beyond satisfaction scores",
      "Struggling Moments: Specific moments when existing solutions fail to help users make progress, representing the highest-value innovation opportunities",
      "Circumstance-Based Segmentation: Grouping users by the circumstances in which they encounter a job, rather than by demographics, which reveals more actionable patterns"
    ],
    "core_concepts_zh": [
      "任务：一个人在特定情境下试图实现的进展——不是他们购买的产品，而是产品被「雇佣」来满足的根本需求",
      "雇佣与解雇：客户「雇佣」产品来完成任务，当产品失败时「解雇」它们，提供了超越满意度评分来理解采纳和流失的视角",
      "挣扎时刻：现有方案无法帮助用户取得进展的特定时刻，代表最高价值的创新机会",
      "基于情境的细分：按用户遇到任务的情境而非按人口统计进行分组，能揭示更具可操作性的模式"
    ],
    "timeline": [
      [
        "1991",
        "Tony Ulwick begins developing Outcome-Driven Innovation (ODI), the methodological precursor to formal JTBD frameworks"
      ],
      [
        "1997",
        "Clayton Christensen introduces the concept of jobs in 'The Innovator's Dilemma', framing disruption around unmet customer jobs"
      ],
      [
        "2003",
        "Christensen and Raynor publish 'The Innovator's Solution', fully articulating the Jobs-to-Be-Done theory"
      ],
      [
        "2016",
        "Christensen publishes 'Competing Against Luck', making JTBD accessible to a broader business audience with the famous milkshake case study"
      ],
      [
        "2020",
        "Bob Moesta (with Greg Engle) publishes 'Demand-Side Sales 101', extending JTBD into the Switch Interview methodology for understanding buying decisions"
      ]
    ],
    "timeline_zh": [
      [
        "1991",
        "Tony Ulwick开始开发结果导向创新（ODI），这是正式JTBD框架的方法论前身"
      ],
      [
        "1997",
        "Clayton Christensen在《创新者的窘境》中引入任务概念，围绕未被满足的客户任务来框定颠覆"
      ],
      [
        "2003",
        "Christensen与Raynor出版《创新者的解答》，全面阐述待完成任务理论"
      ],
      [
        "2016",
        "Christensen出版《与运气竞争》，通过著名的奶昔案例使JTBD为更广泛的商业受众所了解"
      ],
      [
        "2020",
        "Bob Moesta（与Greg Engle合作）出版《需求侧销售101》，将JTBD扩展为理解购买决策的转换访谈方法"
      ]
    ],
    "dos": [
      "Do interview users about the specific circumstances in which they last 'hired' or 'fired' a solution, because circumstantial context reveals the real job better than abstract questions",
      "Do separate functional, emotional, and social dimensions when writing job statements, because products that address only functional needs often lose to those that also address emotional and social dimensions",
      "Do focus on the job's stability over time, because while solutions change rapidly, the underlying jobs humans need done remain remarkably constant",
      "Do look for non-consumption (people who aren't using any solution), because non-consumers often reveal the largest innovation opportunities"
    ],
    "dos_zh": [
      "访谈用户最近一次「雇佣」或「解雇」某方案的具体情境，因为情境上下文比抽象问题更能揭示真实任务",
      "撰写任务陈述时分离功能、情感和社会维度，因为只满足功能需求的产品往往输给同时满足情感和社会维度的产品",
      "关注任务随时间的稳定性，因为虽然解决方案快速变化，但人类需要完成的底层任务却保持惊人的恒定",
      "寻找非消费现象（不使用任何方案的人群），因为非消费者往往揭示最大的创新机会"
    ],
    "donts": [
      "Don't confuse the job with the solution, because 'I want a faster horse' describes a solution while 'I need to get across town reliably in under 20 minutes' describes the job",
      "Don't rely solely on what users say they want, because people are poor at predicting their own behavior — observe what they actually do in struggling moments",
      "Don't segment customers by demographics alone, because a 25-year-old and a 55-year-old may hire the same product for the same job in the same circumstance",
      "Don't write job statements that are too broad ('I want to be happy') or too narrow ('I want button X to be blue'), because useful jobs are specific enough to act on but broad enough to inspire solutions"
    ],
    "donts_zh": [
      "不要混淆任务与解决方案，因为「我想要一匹更快的马」描述的是方案，而「我需要在20分钟内可靠地穿越城镇」描述的才是任务",
      "不要仅依赖用户说他们想要什么，因为人们不擅长预测自己的行为——要观察他们在挣扎时刻实际做了什么",
      "不要仅按人口统计细分客户，因为25岁和55岁的人可能在同一情境下为同一任务雇佣同一产品",
      "不要将任务陈述写得过于宽泛（「我想要快乐」）或过于狭窄（「我想让按钮X变蓝」），因为有用的任务需要足够具体以便行动，又足够宽泛以激发方案"
    ],
    "case_study_company": "Intercom",
    "case_study": "Intercom adopted JTBD as their core product strategy framework after co-founder Des Traynor recognized their customers weren't buying 'a messaging tool' but hiring Intercom for specific jobs like 'onboard new users to reduce churn' and 'qualify inbound leads without adding headcount.' By reorganizing their product roadmap around these jobs rather than feature requests, Intercom focused development on the outcomes customers actually valued, helping them grow from a startup to a $1.3B valuation while maintaining strong product-market fit.",
    "case_study_zh": "Intercom在联合创始人Des Traynor意识到客户并非在购买「即时通讯工具」，而是在为特定任务雇佣Intercom（如「引导新用户以降低流失率」和「在不增加人力的情况下筛选入站线索」）后，将JTBD作为核心产品战略框架。通过围绕这些任务而非功能请求重组产品路线图，Intercom将开发聚焦于客户真正看重的结果，帮助他们从初创公司成长到13亿美元估值，同时保持强劲的产品-市场契合度。",
    "when_not_to_use": [
      "When building commodity infrastructure where the 'job' is universal and well-understood (e.g., DNS resolution, basic file storage)",
      "When the product requirement is precisely specified by regulation or technical standards, leaving no room for job-based discovery",
      "When operating in a pure B2B context where the 'buyer' and 'user' are so different that job analysis requires two separate tracks that may conflict",
      "When the project scope is a minor UI polish or bug fix where the job has already been identified and validated"
    ],
    "when_not_to_use_zh": [
      "当构建「任务」普遍且被充分理解的商品化基础设施（如DNS解析、基本文件存储）时",
      "当产品需求由法规或技术标准精确规定，不留基于任务发现空间时",
      "当处于纯B2B场景中，「购买者」和「使用者」差异极大，任务分析需要两条可能冲突的独立路径时",
      "当项目范围是已识别并验证任务的小型UI优化或缺陷修复时"
    ],
    "adopters": [
      "Intercom",
      "Basecamp",
      "Spotify",
      "Shopify",
      "Microsoft",
      "Autodesk"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Clayton Christensen (2003). \"The Innovator's Solution: Creating and Sustaining Successful Growth\". Harvard Business Review Press.",
    "secondary_sources": [
      "Tony Ulwick (2005). \"What Customers Want: Using Outcome-Driven Innovation to Create Breakthrough Products and Services\". McGraw-Hill.",
      "Clayton Christensen et al. (2016). \"Competing Against Luck: The Story of Innovation and Customer Choice\". Harper Business."
    ],
    "typed_relations": [
      {
        "slug": "design-thinking-ideo",
        "type": "complement"
      },
      {
        "slug": "problem-framing-how-now-wow",
        "type": "related"
      }
    ]
  },
  {
    "id": 5,
    "name": "First Principles Thinking",
    "name_zh": "第一性原理思维",
    "slug": "first-principles-thinking",
    "category": "thinking",
    "desc": "Break problems down to foundational truths, then build back up",
    "desc_zh": "将问题分解至基本真理，再从底层重新构建解决方案",
    "steps": [
      "Question Assumptions: list every assumption baked into the current approach and mark which ones are truly constraints vs. conventions",
      "Decompose to Fundamentals: keep asking 'why?' and 'what is actually required?' until you reach irreducible physical or logical facts",
      "Enumerate Constraints: distinguish hard constraints (physics, math, regulation) from soft constraints (habit, legacy, preference)",
      "Reconstruct from Scratch: design the solution using only the fundamental truths, ignoring prior art and existing patterns",
      "Validate and Benchmark: compare your reconstructed design against the original to quantify improvement and identify overlooked factors"
    ],
    "steps_zh": [
      "质疑假设：列出当前方案中内嵌的所有假设，并区分哪些是真正的约束，哪些只是惯例",
      "分解至基本要素：持续追问「为什么」和「真正需要什么」，直至抵达不可再分的物理或逻辑事实",
      "枚举约束条件：区分硬约束（物理、数学、法规）与软约束（习惯、遗留、偏好）",
      "从零重新构建：仅利用基本事实设计解决方案，忽略现有的模式和先例",
      "验证与对标：将重构设计与原方案对比，量化改进幅度并识别被忽视的因素"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Assumptions",
      "Decompose",
      "Core Truth",
      "Rebuild"
    ],
    "viz_labels_zh": [
      "现有假设",
      "层层拆解",
      "基本原理",
      "重新构建"
    ],
    "related": [
      "analogical-thinking",
      "six-thinking-hats"
    ],
    "tags": [
      "reasoning",
      "decomposition",
      "fundamentals",
      "innovation"
    ],
    "origin_author": "Aristotle (classical origin); popularized in tech by Elon Musk, 2002-present",
    "origin_source": "Aristotle's Metaphysics (classical foundation); modern popularization through Elon Musk's public talks and interviews",
    "origin_source_zh": "亚里士多德《形而上学》（经典基础）；现代由埃隆·马斯克的公开演讲和访谈推广",
    "complexity": "intermediate",
    "when_to_use": [
      "When industry best practices haven't changed in years and you suspect there's a fundamentally better approach hidden beneath convention",
      "When cost reduction targets are 10x, not 10%, and incremental optimization of existing approaches cannot reach the goal",
      "When your team is stuck in 'we've always done it this way' thinking and needs a structured method to break free from legacy assumptions",
      "When evaluating whether to build vs. buy and the market solutions all share the same architectural assumptions you question"
    ],
    "when_to_use_zh": [
      "当行业最佳实践多年未变，你怀疑惯例之下隐藏着根本性更好的方法时",
      "当成本削减目标是10倍而非10%，现有方法的渐进优化无法达到目标时",
      "当团队陷入「我们一直都这样做」的思维定式，需要结构化方法来突破遗留假设时",
      "当评估自建还是外购，而市场解决方案都共享你所质疑的相同架构假设时"
    ],
    "core_concepts": [
      "Fundamental Truths: Irreducible facts (physics, mathematics, logic) that remain true regardless of context, convention, or opinion — the bedrock upon which novel solutions can be built",
      "Assumption Identification: The disciplined practice of surfacing every hidden assumption in current thinking, separating genuine constraints from inherited conventions",
      "Reasoning by First Principles vs. Analogy: Instead of reasoning by analogy ('others do it this way, so we should too'), first principles reasoning builds solutions from ground truth",
      "Constraint Spectrum: Recognizing that constraints exist on a spectrum from immutable (laws of physics) to purely conventional (industry habits), and most assumed constraints are actually soft"
    ],
    "core_concepts_zh": [
      "基本真理：无论背景、惯例或观点如何都成立的不可简化事实（物理、数学、逻辑）——构建新颖方案的基石",
      "假设识别：有纪律地浮现当前思维中的每一个隐含假设，将真正的约束与继承的惯例分开",
      "第一性原理推理 vs. 类比推理：不同于类比推理（「别人都这样做，所以我们也应该」），第一性原理推理从根本事实出发构建方案",
      "约束光谱：认识到约束存在于从不可改变（物理定律）到纯粹惯例（行业习惯）的光谱上，大多数假定的约束实际上是软性的"
    ],
    "timeline": [
      [
        "~350 BC",
        "Aristotle defines first principles (archai) in Metaphysics as the foundational basis of knowledge that cannot be deduced from other propositions"
      ],
      [
        "1687",
        "Isaac Newton's Principia Mathematica demonstrates first principles reasoning by deriving mechanics from three fundamental laws"
      ],
      [
        "2002",
        "Elon Musk applies first principles thinking to rocketry at SpaceX, questioning why rockets cost so much by decomposing to raw material costs"
      ],
      [
        "2012",
        "Musk's famous battery cost analysis at Tesla demonstrates the method publicly: industry said batteries cost $600/kWh, first principles analysis of raw materials showed $80/kWh was achievable"
      ],
      [
        "2020s",
        "First principles thinking becomes a standard framework taught in product management, startup, and engineering leadership programs"
      ]
    ],
    "timeline_zh": [
      [
        "约前350年",
        "亚里士多德在《形而上学》中将第一性原理（archai）定义为不可从其他命题推导出的知识基础"
      ],
      [
        "1687",
        "牛顿的《自然哲学的数学原理》通过从三条基本定律推导力学展示了第一性原理推理"
      ],
      [
        "2002",
        "埃隆·马斯克在SpaceX将第一性原理思维应用于火箭技术，通过分解到原材料成本来质疑火箭为何如此昂贵"
      ],
      [
        "2012",
        "马斯克在Tesla的著名电池成本分析公开展示了该方法：行业说电池成本为600美元/千瓦时，原材料的第一性原理分析显示80美元/千瓦时是可实现的"
      ],
      [
        "2020年代",
        "第一性原理思维成为产品管理、创业和工程领导力课程中的标准框架"
      ]
    ],
    "dos": [
      "Do write down every assumption explicitly before starting decomposition, because unexamined assumptions are invisible prisons for thinking",
      "Do verify your 'fundamental truths' are actually fundamental and not just deeply ingrained assumptions, because false bedrock leads to flawed reconstruction",
      "Do timebox the decomposition phase, because the value of first principles thinking comes from reconstruction, not infinite decomposition",
      "Do involve people from outside the domain in questioning assumptions, because domain experts have the deepest blind spots about their own conventions"
    ],
    "dos_zh": [
      "在开始分解前明确写下每一个假设，因为未被审视的假设是思维的无形牢笼",
      "验证你的「基本真理」确实是基本的而非只是根深蒂固的假设，因为错误的基石会导致有缺陷的重构",
      "为分解阶段设定时间盒，因为第一性原理思维的价值来自重构而非无限分解",
      "邀请领域外的人参与质疑假设，因为领域专家对自己的惯例有最深的盲区"
    ],
    "donts": [
      "Don't use first principles thinking for every decision, because most daily decisions are efficiently handled by analogy and pattern matching",
      "Don't confuse 'ignoring prior art' with 'ignoring prior learning', because understanding why things are done a certain way is essential before deciding to do them differently",
      "Don't skip the validation step, because elegant first-principles designs can overlook practical constraints that incrementalists discovered the hard way",
      "Don't fall into the trap of assuming your decomposition is complete, because missing a fundamental constraint can invalidate the entire reconstruction"
    ],
    "donts_zh": [
      "不要对每个决定都使用第一性原理思维，因为大多数日常决策通过类比和模式匹配能高效处理",
      "不要混淆「忽略先例」和「忽略前人经验」，因为在决定采取不同做法之前，理解事情为何如此做是必要的",
      "不要跳过验证步骤，因为优雅的第一性原理设计可能忽略了渐进式改进者通过艰难教训发现的实际约束",
      "不要陷入认为分解已经完整的陷阱，因为遗漏一个基本约束可能使整个重构失效"
    ],
    "case_study_company": "SpaceX",
    "case_study": "When founding SpaceX in 2002, Elon Musk faced launch costs of $65 million per flight. Instead of accepting industry pricing, he decomposed a rocket to its raw materials — aerospace-grade aluminum, titanium, copper, carbon fiber — and found the materials cost was only about 2% of the typical rocket price. By questioning every inherited assumption about rocket manufacturing (including the assumption that rockets must be expendable), SpaceX developed the Falcon 9 with reusable first stages, ultimately reducing launch costs to roughly $2,700 per kilogram to orbit.",
    "case_study_zh": "2002年创立SpaceX时，埃隆·马斯克面对每次发射6500万美元的成本。他没有接受行业定价，而是将火箭分解到原材料——航空级铝、钛、铜、碳纤维——发现材料成本仅为典型火箭价格的约2%。通过质疑火箭制造的每一个继承假设（包括火箭必须一次性使用的假设），SpaceX开发了具有可复用第一级的猎鹰9号，最终将发射成本降至约每公斤2700美元入轨。",
    "case_study_challenge": "In 2002, the aerospace industry quoted $65 million per launch — a price that had barely moved in decades. Every vendor Musk approached treated the cost structure as an immutable law of physics, not an engineering problem to be solved.",
    "case_study_challenge_zh": "2002年，航天工业报价每次发射6500万美元——这个价格几十年来几乎没有变化。马斯克接触的每一家供应商都将成本结构视为不可改变的物理定律，而非可以解决的工程问题。",
    "case_study_approach": "Musk decomposed a rocket to its raw materials — aerospace-grade aluminum, titanium, copper, carbon fiber — and calculated that material costs were roughly 2% of the sticker price. He then questioned every inherited assumption in the manufacturing chain, including the deepest one: that rockets must be expendable. SpaceX rebuilt the process from atoms up.",
    "case_study_approach_zh": "马斯克将火箭分解到原材料——航空级铝、钛、铜、碳纤维——计算出材料成本仅占标价的约2%。随后他逐一质疑制造链中的每一个继承假设，包括最根深蒂固的那个：火箭必须是一次性的。SpaceX从原子层面重建了整个流程。",
    "case_study_result": "The Falcon 9 with reusable first stages reduced launch costs to approximately $2,700 per kilogram to orbit — a 97% reduction from the industry baseline. The approach transformed space logistics from a government monopoly into a commercial market.",
    "case_study_result_zh": "搭载可复用第一级的猎鹰9号将发射成本降至约每公斤2700美元入轨——相比行业基准下降了97%。这一方法将太空物流从政府垄断转变为商业市场。",
    "case_study_quote": "I tend to approach things from a physics framework. Physics teaches you to reason from first principles rather than by analogy.",
    "case_study_quote_zh": "我倾向于用物理学框架来思考问题。物理学教你从第一性原理出发推理，而非依赖类比。",
    "when_not_to_use": [
      "When making routine, low-stakes decisions where the cost of first-principles analysis exceeds the potential benefit",
      "When working in a domain with well-established, empirically validated best practices that have already been optimized to near-theoretical limits",
      "When time-to-market is critical and an adequate existing solution can be adopted and iterated upon faster than a ground-up redesign",
      "When the problem is primarily a coordination or communication challenge, not a fundamental design problem"
    ],
    "when_not_to_use_zh": [
      "当做例行的低风险决策，第一性原理分析的成本超过潜在收益时",
      "当在有完善且经验证最佳实践的领域工作，这些实践已被优化至接近理论极限时",
      "当上市时间至关重要，采用并迭代已有的适当方案比从零重新设计更快时",
      "当问题主要是协调或沟通挑战，而非根本性设计问题时"
    ],
    "adopters": [
      "SpaceX",
      "Tesla",
      "Amazon",
      "Stripe",
      "Dyson",
      "Anduril"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Aristotle (circa 350 BCE). \"Metaphysics\". Classical philosophical text.",
    "secondary_sources": [
      "Shane Parrish (2019). \"The Great Mental Models Volume 1: General Thinking Concepts\". Latticework Publishing.",
      "Tim Urban (2015). \"The Cook and the Chef: Musk's Secret Sauce\". Wait But Why."
    ],
    "typed_relations": [
      {
        "slug": "analogical-thinking",
        "type": "complement"
      },
      {
        "slug": "six-thinking-hats",
        "type": "related"
      }
    ]
  },
  {
    "id": 6,
    "name": "Design by Contract (DbC)",
    "name_zh": "契约式设计",
    "slug": "design-by-contract",
    "category": "thinking",
    "desc": "Define explicit preconditions, postconditions, invariants per unit",
    "desc_zh": "为每个软件单元定义明确的前置条件、后置条件与不变式",
    "steps": [
      "Define Preconditions: specify what callers must guarantee to be true before invoking each function or service",
      "Define Postconditions: specify what the function or service guarantees to be true upon successful completion",
      "Define Class Invariants: state properties that must hold for every object or module in any observable state",
      "Encode Contracts in Code: express contracts as assertions, type annotations, or formal specs (e.g., OpenAPI, type guards)",
      "Enforce at Runtime and Test Time: run contract checks in development; use property-based tests to fuzz contract boundaries"
    ],
    "steps_zh": [
      "定义前置条件：明确调用者在调用每个函数或服务前必须保证为真的条件",
      "定义后置条件：明确函数或服务在成功完成后保证为真的条件",
      "定义类不变式：声明对象或模块在任何可观测状态下都必须成立的属性",
      "在代码中编码契约：通过断言、类型注解或形式规范（如OpenAPI、类型守卫）表达契约",
      "在运行时和测试时强制执行：开发中运行契约检查；使用基于属性的测试对契约边界进行模糊测试"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Precondition",
      "Body",
      "Postcondition",
      "Invariant"
    ],
    "viz_labels_zh": [
      "前置条件",
      "执行逻辑",
      "后置条件",
      "类不变式"
    ],
    "related": [
      "property-based-testing",
      "contract-testing",
      "clean-code-principles"
    ],
    "tags": [
      "contracts",
      "preconditions",
      "postconditions",
      "invariants",
      "correctness"
    ],
    "origin_author": "Bertrand Meyer, 1986",
    "origin_source": "Object-Oriented Software Construction (Bertrand Meyer, 1988/1997)",
    "origin_source_zh": "《面向对象软件构造》（Bertrand Meyer，1988/1997年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When building mission-critical software (financial transactions, medical devices, infrastructure) where correctness is non-negotiable",
      "When designing public APIs or libraries consumed by other teams who need clear guarantees about input/output behavior",
      "When debugging is consuming excessive time because failure causes are far from failure symptoms in the call chain",
      "When onboarding new developers to a codebase and you need code that explicitly documents its own usage rules"
    ],
    "when_to_use_zh": [
      "当构建正确性不可妥协的关键任务软件（金融交易、医疗设备、基础设施）时",
      "当设计供其他团队使用的公共API或库，他们需要关于输入/输出行为的清晰保证时",
      "当调试耗费过多时间，因为故障原因与调用链中的故障症状相距甚远时",
      "当新开发者入职代码库，需要代码明确记录自身使用规则时"
    ],
    "core_concepts": [
      "Preconditions: Obligations the caller must satisfy before invoking a routine — if violated, the caller has a bug, not the routine",
      "Postconditions: Guarantees the routine promises to deliver upon normal completion — if violated, the routine has a bug, not the caller",
      "Class Invariants: Properties that must be true for all instances of a class between public method calls, defining what makes an object 'valid'",
      "Blame Assignment: Contracts create clear responsibility boundaries — a violated precondition is the caller's fault, a violated postcondition is the supplier's fault, eliminating ambiguity in bug attribution"
    ],
    "core_concepts_zh": [
      "前置条件：调用者在调用例程前必须满足的义务——如果违反，是调用者有缺陷，而非例程",
      "后置条件：例程承诺在正常完成时交付的保证——如果违反，是例程有缺陷，而非调用者",
      "类不变式：在公共方法调用之间必须为所有类实例成立的属性，定义什么使一个对象「有效」",
      "责任归属：契约创建清晰的责任边界——违反前置条件是调用者的错，违反后置条件是提供者的错，消除缺陷归因的模糊性"
    ],
    "timeline": [
      [
        "1969",
        "Tony Hoare introduces the concept of preconditions and postconditions in his paper 'An Axiomatic Basis for Computer Programming'"
      ],
      [
        "1986",
        "Bertrand Meyer coins 'Design by Contract' and builds it into the Eiffel programming language as a first-class language feature"
      ],
      [
        "1988",
        "Meyer publishes 'Object-Oriented Software Construction', the comprehensive reference for DbC methodology"
      ],
      [
        "2002",
        "Java 1.4 introduces assert statements; Microsoft later ships the Code Contracts library with .NET 4.0 (2010), bringing DbC concepts to mainstream languages"
      ],
      [
        "2010s",
        "Modern typed languages (TypeScript, Rust, Kotlin) embed contract-like thinking through advanced type systems, sum types, and non-nullable references"
      ]
    ],
    "timeline_zh": [
      [
        "1969",
        "Tony Hoare在「计算机编程的公理基础」论文中引入前置条件和后置条件的概念"
      ],
      [
        "1986",
        "Bertrand Meyer提出「契约式设计」一词，并将其作为一等语言特性内建于Eiffel编程语言中"
      ],
      [
        "1988",
        "Meyer出版《面向对象软件构造》，成为DbC方法论的全面参考"
      ],
      [
        "2002",
        "Java 1.4引入assert语句；此后微软随.NET 4.0（2010年）推出Code Contracts库，将DbC概念带入主流语言"
      ],
      [
        "2010年代",
        "现代类型化语言（TypeScript、Rust、Kotlin）通过高级类型系统、联合类型和非空引用嵌入类似契约的思维"
      ]
    ],
    "dos": [
      "Do make contracts as precise and machine-checkable as possible, because vague contracts in comments are ignored while assertions in code are enforced",
      "Do use contracts to clarify the boundary between caller and callee responsibility, because ambiguous boundaries are where the worst bugs hide",
      "Do keep contracts focused on 'what' not 'how', because contracts specify observable guarantees, not implementation details",
      "Do run contract checks in development and testing but consider disabling expensive checks in production, because contracts are primarily a design and debugging tool"
    ],
    "dos_zh": [
      "尽可能使契约精确且可机器检查，因为注释中的模糊契约会被忽略，而代码中的断言能被强制执行",
      "使用契约澄清调用者和被调用者的责任边界，因为模糊的边界是最严重缺陷的藏身之处",
      "保持契约聚焦于「是什么」而非「怎么做」，因为契约规定的是可观察的保证而非实现细节",
      "在开发和测试中运行契约检查，但考虑在生产环境禁用昂贵的检查，因为契约主要是设计和调试工具"
    ],
    "donts": [
      "Don't use contracts as a substitute for input validation on trust boundaries, because contracts assume trusted callers while external input is inherently untrusted",
      "Don't over-specify contracts to the point where implementation flexibility is eliminated, because overly tight contracts create brittle coupling between caller and callee",
      "Don't silently swallow contract violations, because a violated contract indicates a program bug that should be surfaced immediately, not papered over",
      "Don't apply DbC to trivially simple getters and setters, because the overhead of writing and maintaining contracts should be proportional to the complexity of the behavior"
    ],
    "donts_zh": [
      "不要将契约用作信任边界上输入验证的替代品，因为契约假定调用者可信，而外部输入本质上不可信",
      "不要过度指定契约到消除实现灵活性的程度，因为过紧的契约在调用者和被调用者之间创建脆弱的耦合",
      "不要默默吞掉契约违反，因为被违反的契约表示程序缺陷，应立即暴露而非掩盖",
      "不要对琐碎的getter和setter应用DbC，因为编写和维护契约的开销应与行为的复杂性成正比"
    ],
    "case_study_company": "Microsoft (Midori Project)",
    "case_study": "Microsoft's experimental Midori operating system project (2008-2015) was built from the ground up using Design by Contract principles in a language derived from C# with first-class contract support. Every API surface had machine-checked preconditions, postconditions, and invariants. The team reported that this approach eliminated entire categories of runtime errors and dramatically reduced debugging time, with contracts catching defects at compile time that would have been expensive production bugs in a traditional system.",
    "case_study_zh": "微软的实验性Midori操作系统项目（2008-2015）从零开始使用契约式设计原则构建，采用一种带有一等契约支持的C#衍生语言。每个API表面都有机器检查的前置条件、后置条件和不变式。团队报告该方法消除了整类运行时错误，大幅减少了调试时间，契约在编译期捕获的缺陷在传统系统中将会是昂贵的生产缺陷。",
    "when_not_to_use": [
      "When building a rapid prototype where design is expected to change daily and contracts would create excessive maintenance burden",
      "When working in a dynamically typed scripting environment with no assertion infrastructure, making contracts impractical to enforce",
      "When the performance overhead of runtime contract checking is unacceptable in the production hot path and the team lacks the discipline to separate development-time from production-time checks",
      "When the codebase is primarily glue code or configuration with trivial logic that doesn't benefit from formal contract specification"
    ],
    "when_not_to_use_zh": [
      "当构建快速原型，设计预计每天变化，契约会造成过度维护负担时",
      "当在没有断言基础设施的动态类型脚本环境中工作，契约无法实际执行时",
      "当运行时契约检查的性能开销在生产热路径中不可接受，且团队缺乏区分开发期和生产期检查的纪律时",
      "当代码库主要是胶水代码或配置，逻辑琐碎，不受益于正式契约规范时"
    ],
    "adopters": [
      "Eiffel Software",
      "Microsoft Research",
      "Bloomberg",
      "AdaCore",
      "JetBrains",
      "Google (Guava Preconditions)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "reliability",
      "testability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Bertrand Meyer (1988). \"Object-Oriented Software Construction\". Prentice Hall.",
    "secondary_sources": [
      "Bertrand Meyer (1997). \"Object-Oriented Software Construction, 2nd Edition\". Prentice Hall.",
      "Bertrand Meyer (1992). \"Applying Design by Contract\". IEEE Computer, 25(10)."
    ],
    "typed_relations": [
      {
        "slug": "property-based-testing",
        "type": "complement"
      },
      {
        "slug": "contract-testing",
        "type": "related"
      },
      {
        "slug": "clean-code-principles",
        "type": "related"
      }
    ]
  },
  {
    "id": 7,
    "name": "Cynefin Framework",
    "name_zh": "库内文框架",
    "slug": "cynefin-framework",
    "category": "thinking",
    "desc": "Categorize problems into Simple, Complicated, Complex, Chaotic",
    "desc_zh": "将问题归类为简单、繁杂、复杂、混沌四个领域以选择对策",
    "steps": [
      "Categorize the Problem Domain: assess whether the situation is Simple (known), Complicated (expert-solvable), Complex (emergent), or Chaotic (novel/crisis)",
      "Select the Response Pattern: apply best practice (Simple), good practice with analysis (Complicated), probe-sense-respond (Complex), or act-sense-respond (Chaotic)",
      "Handle Disorder: if the domain is unclear, break the problem into smaller parts and classify each part separately",
      "Manage Domain Transitions: watch for situations moving between domains (e.g., a Complex system becoming Chaotic under load)",
      "Document Domain Reasoning: record why a problem was classified in a domain to inform future architectural decisions"
    ],
    "steps_zh": [
      "归类问题领域：判断当前情况属于简单（已知）、繁杂（专家可解）、复杂（涌现）还是混沌（新颖/危机）",
      "选择响应模式：简单领域用最佳实践，繁杂领域用专家分析，复杂领域用探测-感知-响应，混沌领域用行动-感知-响应",
      "处理无序状态：若领域不明确，将问题拆分为更小部分并分别归类",
      "管理领域转换：监控问题在领域间的迁移（如复杂系统在负载下转变为混沌）",
      "记录领域推理：记录问题被归类到某领域的原因，以指导未来的架构决策"
    ],
    "ai_relevant": false,
    "viz_type": "quadrant",
    "viz_labels": [
      "Simple",
      "Complicated",
      "Complex",
      "Chaotic"
    ],
    "viz_labels_zh": [
      "简单",
      "繁杂",
      "复杂",
      "混沌"
    ],
    "related": [
      "systems-thinking",
      "trade-off-sliders",
      "wardley-mapping"
    ],
    "tags": [
      "decision-making",
      "complexity",
      "problem-classification",
      "sense-making"
    ],
    "origin_author": "Dave Snowden, 1999",
    "origin_source": "A Leader's Framework for Decision Making (Harvard Business Review, 2007)",
    "origin_source_zh": "《领导者的决策框架》（《哈佛商业评论》，2007年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When your team applies the same process to every problem regardless of its nature, leading to overengineering simple tasks or underestimating complex ones",
      "When stakeholders demand a detailed upfront plan for a problem that is inherently emergent and unpredictable",
      "When a production incident occurs and you need to quickly decide whether to apply known fixes or enter experimental investigation mode",
      "When choosing between agile, waterfall, or hybrid methodologies and you need a principled basis for the choice"
    ],
    "when_to_use_zh": [
      "当团队不论问题性质如何都使用相同流程，导致简单任务过度工程化或低估复杂任务时",
      "当利益相关者对本质上涌现且不可预测的问题要求详细的前期计划时",
      "当生产事故发生，需要快速决定是应用已知修复还是进入实验性调查模式时",
      "当在敏捷、瀑布或混合方法论之间选择，需要一个有原则的选择依据时"
    ],
    "core_concepts": [
      "Four Domains + Disorder: Clear (obvious cause-effect), Complicated (discoverable cause-effect requiring expertise), Complex (cause-effect only visible in retrospect), and Chaotic (no perceivable cause-effect), plus Disorder when you don't know which domain applies",
      "Probe-Sense-Respond: The strategy for Complex domains — run safe-to-fail experiments, observe what emerges, and amplify what works rather than planning upfront",
      "Domain Dynamics: Problems can shift between domains (e.g., complacency in the Clear domain causes a cliff-edge fall into Chaos), requiring continuous monitoring",
      "Contextual Appropriateness: There is no single 'best' management approach — the optimal response depends entirely on which domain the problem inhabits"
    ],
    "core_concepts_zh": [
      "四领域+无序：清晰（明显的因果关系）、繁杂（需要专业知识才能发现的因果关系）、复杂（因果关系只能事后可见）和混沌（无可感知的因果关系），加上不知道适用哪个领域时的无序状态",
      "探测-感知-响应：复杂领域的策略——运行安全可失败的实验，观察涌现结果，放大有效的方法，而非提前规划",
      "领域动态：问题可以在领域间迁移（如在清晰领域的自满导致悬崖式跌入混沌），需要持续监控",
      "情境适当性：不存在单一「最佳」管理方法——最优响应完全取决于问题所处的领域"
    ],
    "timeline": [
      [
        "1999",
        "Dave Snowden develops the Cynefin framework while working at IBM's Institute for Knowledge Management"
      ],
      [
        "2003",
        "Snowden and Cynthia Kurtz publish 'The New Dynamics of Strategy', the first academic paper on Cynefin"
      ],
      [
        "2007",
        "Snowden and Mary Boone publish 'A Leader's Framework for Decision Making' in Harvard Business Review, bringing Cynefin to mainstream business"
      ],
      [
        "2010",
        "The Agile community widely adopts Cynefin to explain when Scrum (Complex) vs. Kanban (Complicated) vs. defined processes (Clear) are appropriate"
      ],
      [
        "2020",
        "Snowden updates the framework, renaming 'Simple' to 'Clear' and adding 'Aporetic' (confused/unknowable) to reflect evolving understanding of complexity"
      ]
    ],
    "timeline_zh": [
      [
        "1999",
        "Dave Snowden在IBM知识管理研究所工作期间开发了库内文框架"
      ],
      [
        "2003",
        "Snowden与Cynthia Kurtz发表「战略的新动力学」，这是关于库内文的首篇学术论文"
      ],
      [
        "2007",
        "Snowden与Mary Boone在《哈佛商业评论》发表「领导者的决策框架」，将库内文推向主流商业领域"
      ],
      [
        "2010",
        "敏捷社区广泛采用库内文来解释何时使用Scrum（复杂域）、Kanban（繁杂域）或定义流程（清晰域）"
      ],
      [
        "2020",
        "Snowden更新框架，将「简单」更名为「清晰」，并新增「迷惑」域以反映对复杂性理解的演进"
      ]
    ],
    "dos": [
      "Do use Cynefin as a sense-making tool before choosing a process methodology, because the domain should drive the approach rather than the other way around",
      "Do design safe-to-fail experiments for Complex domain problems, because small probes with bounded downside reveal patterns that analysis cannot",
      "Do reassess domain classification regularly, because problems move between domains as context changes and yesterday's Complex problem may be today's Complicated one",
      "Do teach the entire team to recognize domain signals, because frontline engineers often detect domain shifts (e.g., from Complicated to Chaotic) before managers do"
    ],
    "dos_zh": [
      "在选择流程方法论之前先使用库内文作为意义建构工具，因为领域应驱动方法而非反过来",
      "为复杂领域问题设计安全可失败的实验，因为有限下行风险的小探测能揭示分析无法发现的模式",
      "定期重新评估领域分类，因为问题会随情境变化在领域间移动，昨天的复杂问题可能是今天的繁杂问题",
      "教会整个团队识别领域信号，因为一线工程师往往比管理者更早察觉领域转换（如从繁杂到混沌）"
    ],
    "donts": [
      "Don't treat Cynefin domains as permanent labels, because a problem's domain classification can change as understanding deepens or conditions shift",
      "Don't apply best practices to Complex domain problems, because in complexity there are no repeatable best practices — only emergent patterns to discover",
      "Don't spend time analyzing during a Chaotic crisis, because Chaotic situations demand immediate action to stabilize before any analysis is possible",
      "Don't dismiss the Disorder zone, because the most dangerous state is not knowing which domain you're in and defaulting to comfortable but inappropriate responses"
    ],
    "donts_zh": [
      "不要将库内文领域视为永久标签，因为随着理解加深或条件变化，问题的领域分类可能改变",
      "不要对复杂领域问题应用最佳实践，因为在复杂性中没有可重复的最佳实践——只有待发现的涌现模式",
      "不要在混沌危机中花时间分析，因为混沌情况要求立即行动以稳定局势，在此之前任何分析都不可能",
      "不要忽视无序区域，因为最危险的状态是不知道自己在哪个领域并默认采用舒适但不当的响应方式"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify used Cynefin thinking to shape their engineering culture and organizational model. They recognized that music recommendation algorithms operated in the Complex domain (requiring probe-sense-respond experimentation with A/B tests), while payment processing was Complicated (requiring expert analysis and good practices), and basic infrastructure provisioning was Clear (requiring standard operating procedures). This domain-aware approach led to their famous Squad/Tribe/Chapter/Guild model, where different teams used different processes matched to their problem domain.",
    "case_study_zh": "Spotify运用库内文思维来塑造其工程文化和组织模型。他们认识到音乐推荐算法运行在复杂领域（需要通过A/B测试进行探测-感知-响应实验），支付处理属于繁杂领域（需要专家分析和良好实践），而基础设施供应属于清晰领域（需要标准操作流程）。这种领域感知的方法催生了他们著名的小队/部落/分会/公会模型，不同团队使用与其问题领域匹配的不同流程。",
    "case_study_challenge": "Spotify's engineering organization was scaling rapidly, but a one-size-fits-all process was failing. Agile sprints worked well for infrastructure teams but stifled the exploratory experimentation that recommendation teams needed. Meanwhile, payment processing demanded rigor that felt bureaucratic to other groups.",
    "case_study_challenge_zh": "Spotify的工程组织在快速扩张，但统一流程正在失效。敏捷冲刺对基础设施团队效果良好，却扼杀了推荐团队所需的探索性实验。与此同时，支付处理要求的严谨性在其他团队看来过于官僚。",
    "case_study_approach": "Leadership applied Cynefin to classify each team's problem domain. Music recommendation — Complex: teams ran safe-to-fail A/B experiments with probe-sense-respond cycles. Payment processing — Complicated: expert analysis and established good practices. Infrastructure provisioning — Clear: standard operating procedures and checklists.",
    "case_study_approach_zh": "管理层运用库内文框架对每个团队的问题领域进行分类。音乐推荐属于复杂领域：团队开展安全失败的A/B实验，采用探测-感知-响应循环。支付处理属于繁杂领域：依赖专家分析和成熟的良好实践。基础设施供应属于清晰领域：遵循标准操作流程和检查清单。",
    "case_study_result": "This domain-aware thinking produced Spotify's celebrated Squad/Tribe/Chapter/Guild organizational model, where each team adopted processes matched to its problem complexity rather than following a company-wide mandate. Engineering autonomy and velocity improved across all domains simultaneously.",
    "case_study_result_zh": "这种领域感知思维催生了Spotify著名的小队/部落/分会/公会组织模型，每个团队采用与其问题复杂度匹配的流程，而非遵循全公司统一的指令。各领域的工程自主性和交付速度同步提升。",
    "case_study_quote": "The mistake is assuming all problems are the same kind. The framework gave us permission to treat different problems differently.",
    "case_study_quote_zh": "错误在于假设所有问题都是同一类。这个框架赋予我们以不同方式对待不同问题的许可。",
    "when_not_to_use": [
      "When the team needs a specific actionable methodology rather than a meta-framework for choosing methodologies",
      "When all problems in the project clearly fall within a single domain and domain classification adds no value",
      "When stakeholders want quantitative risk analysis and the qualitative nature of Cynefin doesn't satisfy their decision-making needs",
      "When the organization lacks psychological safety for safe-to-fail experiments, making Complex domain practices impractical"
    ],
    "when_not_to_use_zh": [
      "当团队需要具体可操作的方法论而非选择方法论的元框架时",
      "当项目中所有问题明显落在单一领域内，领域分类不增加价值时",
      "当利益相关者需要定量风险分析，库内文的定性特征无法满足其决策需求时",
      "当组织缺乏心理安全来进行安全可失败的实验，使复杂领域实践不可行时"
    ],
    "adopters": [
      "Spotify",
      "IBM",
      "UK Government Digital Service",
      "Australian Tax Office",
      "Lego",
      "Ericsson"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Dave Snowden and Mary Boone (2007). \"A Leader's Framework for Decision Making\". Harvard Business Review.",
    "secondary_sources": [
      "Dave Snowden (2000). \"Cynefin: A Sense of Time and Space\". Knowledge Management World.",
      "Dave Snowden and Zhen Goh (2020). \"Cynefin: Weaving Sense-Making into the Fabric of Our World\". Cognitive Edge."
    ],
    "typed_relations": [
      {
        "slug": "systems-thinking",
        "type": "complement"
      },
      {
        "slug": "trade-off-sliders",
        "type": "complement"
      },
      {
        "slug": "wardley-mapping",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 8,
    "name": "Wardley Mapping",
    "name_zh": "沃德利地图",
    "slug": "wardley-mapping",
    "category": "thinking",
    "desc": "Visualize value chains by evolution stage to drive strategy",
    "desc_zh": "按演化阶段可视化价值链，以驱动技术与商业战略决策",
    "steps": [
      "Anchor on User Need: place the user at the top and define the visible capability they need (the 'anchor')",
      "Build the Value Chain: map all components your system needs to deliver that capability, from user-facing to infrastructure",
      "Position by Evolution: place each component on the x-axis from Genesis → Custom → Product → Commodity based on its maturity",
      "Identify Strategic Moves: spot components that are over-engineered (custom when commodity exists) or under-invested",
      "Apply Climatic Patterns: use known evolution patterns (e.g., componentization, inertia, co-evolution) to anticipate change"
    ],
    "steps_zh": [
      "锚定用户需求：将用户置于顶部，定义其所需的可见能力（「锚点」）",
      "构建价值链：绘制系统交付该能力所需的所有组件，从面向用户到基础设施",
      "按演化阶段定位：根据成熟度将每个组件放置在X轴上（初创→定制→产品→商品）",
      "识别战略举措：发现过度工程化（已有商品化方案却仍定制）或投资不足的组件",
      "应用气候模式：利用已知演化规律（如组件化、惯性、协同演化）预判变化趋势"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "User Need",
      "Value Chain",
      "Evolution Axis",
      "Position"
    ],
    "viz_labels_zh": [
      "用户需求",
      "价值链",
      "演化轴",
      "定位"
    ],
    "related": [
      "cynefin-framework",
      "systems-thinking",
      "trade-off-sliders"
    ],
    "tags": [
      "strategy",
      "value-chain",
      "evolution",
      "mapping",
      "technology-decisions"
    ],
    "origin_author": "Simon Wardley, 2005",
    "origin_source": "Wardley Maps: Topographical Intelligence in Business (Simon Wardley, 2018, open Creative Commons book)",
    "origin_source_zh": "《沃德利地图：商业中的地形情报》（Simon Wardley，2018年，开放知识共享许可书籍）",
    "complexity": "advanced",
    "when_to_use": [
      "When making build-vs-buy decisions for infrastructure components and you need a principled way to assess which components to commoditize",
      "When planning a multi-year technology strategy and you need to anticipate which technologies will become commodities",
      "When a competitor's strategic moves seem surprising and you need a framework to understand the landscape they're operating in",
      "When allocating engineering resources across teams and you need to distinguish between components that need innovation vs. components that need reliability"
    ],
    "when_to_use_zh": [
      "当为基础设施组件做自建还是外购决策，需要一种有原则的方式来评估哪些组件应商品化时",
      "当规划多年技术战略，需要预判哪些技术将变为商品时",
      "当竞争对手的战略举措令人意外，需要一个框架来理解他们所处的竞争格局时",
      "当在各团队间分配工程资源，需要区分需要创新的组件与需要可靠性的组件时"
    ],
    "core_concepts": [
      "Value Chain: A vertical chain from user need (top) to underlying components (bottom), making dependencies explicit and revealing what actually delivers value",
      "Evolution Axis: Components evolve from Genesis (novel, uncertain) through Custom-Built and Product to Commodity (standardized, utility), and this evolution is inevitable",
      "Climatic Patterns: Predictable patterns of change (e.g., everything evolves toward commodity, success breeds inertia, higher-order systems create new needs) that enable strategic anticipation",
      "Gameplay: Deliberate strategic actions (open source a component to accelerate commoditization, use an ecosystem to create lock-in, exploit inertia in competitors) informed by the map"
    ],
    "core_concepts_zh": [
      "价值链：从用户需求（顶部）到底层组件（底部）的垂直链条，使依赖关系显式化并揭示真正交付价值的东西",
      "演化轴：组件从初创（新颖、不确定）经定制和产品演化至商品（标准化、实用），这种演化是不可避免的",
      "气候模式：可预测的变化规律（如一切向商品演化、成功滋生惯性、高阶系统创造新需求），能够支持战略预判",
      "博弈策略：由地图指导的深思熟虑的战略行动（开源某组件以加速商品化、利用生态系统创建锁定、利用竞争对手的惯性）"
    ],
    "timeline": [
      [
        "2005",
        "Simon Wardley creates the first Wardley Map while serving as CEO of Fotango, a Canon Europe subsidiary, to inform cloud strategy decisions"
      ],
      [
        "2008",
        "Wardley begins publicly sharing the mapping methodology through blog posts and conference talks, building an open-source community of practitioners"
      ],
      [
        "2014",
        "The UK Government Digital Service adopts Wardley Mapping for technology strategy, demonstrating applicability in large-scale public sector IT"
      ],
      [
        "2018",
        "Wardley publishes his comprehensive (and free) book on the methodology, making it widely accessible"
      ],
      [
        "2020s",
        "Wardley Mapping gains significant traction in platform engineering and cloud-native architecture communities as a tool for technology portfolio management"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "Simon Wardley在担任佳能欧洲子公司Fotango的CEO时创建了第一张沃德利地图，用于指导云战略决策"
      ],
      [
        "2008",
        "Wardley开始通过博客文章和会议演讲公开分享映射方法论，建立实践者的开源社区"
      ],
      [
        "2014",
        "英国政府数字服务采用沃德利地图进行技术战略规划，证明了其在大规模公共部门IT中的适用性"
      ],
      [
        "2018",
        "Wardley出版全面且免费的方法论书籍，使其被广泛获取"
      ],
      [
        "2020年代",
        "沃德利地图在平台工程和云原生架构社区中作为技术组合管理工具获得显著关注"
      ]
    ],
    "dos": [
      "Do start with user needs at the top of the map, because every component only has value insofar as it ultimately serves a user need",
      "Do challenge the evolutionary position of each component with evidence, because misplacing a component (treating a commodity as custom) leads to wasted investment",
      "Do create maps collaboratively with both business and technical stakeholders, because the most valuable insights emerge from bridging their different perspectives",
      "Do update maps regularly as the landscape evolves, because a map is a snapshot in time and yesterday's Genesis component may be today's Commodity"
    ],
    "dos_zh": [
      "从地图顶部的用户需求开始，因为每个组件只有在最终服务于用户需求的范围内才有价值",
      "用证据挑战每个组件的演化位置，因为组件定位错误（将商品视为定制）会导致投资浪费",
      "与商业和技术利益相关者协作创建地图，因为最有价值的洞察来自桥接他们不同的视角",
      "随着格局演变定期更新地图，因为地图是时间快照，昨天的初创组件可能是今天的商品"
    ],
    "donts": [
      "Don't create maps in isolation without domain experts, because accurate component positioning requires deep knowledge of both technology maturity and market availability",
      "Don't treat the evolution axis as a precise measurement, because it's a relative positioning tool for strategic discussion, not a quantitative metric",
      "Don't map everything at once, because trying to capture the entire organization's value chain in one map creates an unreadable mess — focus on one user need at a time",
      "Don't use Wardley Maps as a one-time exercise, because their value comes from tracking movement along the evolution axis over time and adjusting strategy accordingly"
    ],
    "donts_zh": [
      "不要在没有领域专家的情况下独自创建地图，因为准确的组件定位需要对技术成熟度和市场可用性的深入了解",
      "不要将演化轴视为精确测量，因为它是用于战略讨论的相对定位工具，不是定量指标",
      "不要试图一次映射所有内容，因为在一张地图中捕获整个组织的价值链会创建不可读的混乱——一次聚焦一个用户需求",
      "不要将沃德利地图用作一次性练习，因为其价值来自随时间追踪演化轴上的移动并相应调整战略"
    ],
    "case_study_company": "UK Government Digital Service (GDS)",
    "case_study": "The UK Government Digital Service used Wardley Mapping to transform how the British government procures and builds technology. By mapping the value chains of government digital services, GDS identified that many departments were building custom solutions for components that had already evolved to commodity (e.g., identity verification, hosting, payments). This led to the creation of shared platforms like GOV.UK Pay, GOV.UK Notify, and GOV.UK Verify, saving hundreds of millions of pounds and dramatically improving service delivery speed.",
    "case_study_zh": "英国政府数字服务使用沃德利地图来变革英国政府采购和构建技术的方式。通过映射政府数字服务的价值链，GDS发现许多部门在为已经演化为商品的组件（如身份验证、托管、支付）构建定制方案。这促成了GOV.UK Pay、GOV.UK Notify和GOV.UK Verify等共享平台的创建，节省了数亿英镑并大幅提升了服务交付速度。",
    "when_not_to_use": [
      "When the scope is a single component with no meaningful value chain to map (e.g., choosing a logging library)",
      "When the team needs to make an immediate tactical decision and doesn't have time for strategic landscape analysis",
      "When the domain is so novel that no components have evolved past Genesis, making the evolution axis uninformative",
      "When stakeholders expect quantitative ROI projections and the qualitative nature of Wardley Maps won't satisfy their analytical requirements"
    ],
    "when_not_to_use_zh": [
      "当范围是单个没有有意义价值链可映射的组件（如选择日志库）时",
      "当团队需要做即时战术决策，没有时间进行战略格局分析时",
      "当领域如此新颖，没有组件演化超过初创阶段，使演化轴无法提供信息时",
      "当利益相关者期望定量的投资回报率预测，沃德利地图的定性特征无法满足其分析需求时"
    ],
    "adopters": [
      "UK Government Digital Service",
      "Leading Edge Forum",
      "Canonical (Ubuntu)",
      "Red Hat",
      "Thoughtworks",
      "HashiCorp"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Simon Wardley (2018). \"Wardley Maps: Topographical Intelligence in Business\". Creative Commons.",
    "secondary_sources": [
      "Simon Wardley (2016). \"Wardley Mapping: An Iterative Process of Situational Awareness\". blog.gardeviance.org.",
      "Ben Mosior (2021). \"Wardley Mapping Canvas and Resources\". Hiredthought.com."
    ],
    "typed_relations": [
      {
        "slug": "cynefin-framework",
        "type": "alternative"
      },
      {
        "slug": "systems-thinking",
        "type": "complement"
      },
      {
        "slug": "trade-off-sliders",
        "type": "complement"
      }
    ]
  },
  {
    "id": 9,
    "name": "Problem Framing (How-Now-Wow Matrix)",
    "name_zh": "问题框架化（如何-现在-哇矩阵）",
    "slug": "problem-framing-how-now-wow",
    "category": "thinking",
    "desc": "Frame and prioritize ideas by feasibility and originality axes",
    "desc_zh": "通过可行性与原创性两轴框架化并优先排序设计创意",
    "steps": [
      "State the Challenge: write a clear, bounded challenge statement that frames the problem space without presupposing solutions",
      "Generate Ideas Freely: brainstorm solutions without filtering, aiming for high quantity across all feasibility and originality levels",
      "Place on the Matrix: map each idea onto a 2x2 grid of Originality (low/high) vs. Feasibility (low/high)",
      "Classify Quadrants: label ideas as 'Now' (easy/common), 'How' (novel/hard), 'Wow' (novel/feasible), or 'Discard' (easy/common)",
      "Prioritize 'Wow' Ideas: select 'Wow' quadrant ideas for prototyping and validate 'How' ideas for future investment"
    ],
    "steps_zh": [
      "陈述挑战：撰写清晰、有边界的挑战陈述，框定问题空间而不预设解决方案",
      "自由生成创意：不加筛选地头脑风暴，在所有可行性和原创性层面追求数量",
      "放置到矩阵中：将每个创意映射到原创性（低/高）与可行性（低/高）的2×2方格中",
      "分类各象限：将创意标记为「现在」（易/普通）、「如何」（新颖/难）、「哇」（新颖/可行）或「丢弃」",
      "优先选择「哇」类创意：选取「哇」象限的创意进行原型验证，评估「如何」类创意的未来投资价值"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "How (Unusual)",
      "Now (Common)",
      "Wow (Novel)",
      "Idea Fit"
    ],
    "viz_labels_zh": [
      "How非常规",
      "Now常规",
      "Wow新颖",
      "创意分类"
    ],
    "related": [
      "design-thinking-ideo",
      "six-thinking-hats",
      "jobs-to-be-done"
    ],
    "tags": [
      "ideation",
      "prioritization",
      "feasibility",
      "creativity"
    ],
    "origin_author": "The Gamestorming community / Dave Gray, Sunni Brown, James Macanufo, 2010",
    "origin_source": "Gamestorming: A Playbook for Innovators, Rulebreakers, and Changemakers (Dave Gray, Sunni Brown, James Macanufo, 2010)",
    "origin_source_zh": "《游戏风暴：创新者、规则破坏者和变革者的游戏手册》（Dave Gray、Sunni Brown、James Macanufo，2010年）",
    "complexity": "beginner",
    "when_to_use": [
      "When a brainstorming session has generated dozens of ideas and the team needs a fast, visual way to prioritize them",
      "When stakeholders argue about which ideas to pursue and you need an objective framework that separates novelty from feasibility",
      "When a product team's backlog is full of incremental improvements but lacks breakthrough innovation and needs to identify 'Wow' opportunities",
      "When running a design sprint and you need to narrow from many ideas to a few prototypable concepts within a single session"
    ],
    "when_to_use_zh": [
      "当头脑风暴产生了数十个创意，团队需要一种快速、直观的方式来排列优先级时",
      "当利益相关者争论该追求哪些创意，需要一个区分新颖性与可行性的客观框架时",
      "当产品团队的待办列表满是渐进改进但缺乏突破性创新，需要识别「哇」级机会时",
      "当进行设计冲刺，需要在单次会议中从众多创意缩减到少数可原型化概念时"
    ],
    "core_concepts": [
      "Two-Axis Evaluation: Plotting ideas on Originality (vertical) vs. Feasibility (horizontal) forces teams to evaluate both dimensions simultaneously rather than defaulting to only one",
      "Wow Quadrant: Ideas that are both novel and feasible represent the highest-value opportunities — innovative enough to differentiate but practical enough to ship",
      "How Quadrant: High-originality but low-feasibility ideas are not discarded but parked for future investment when technology, budget, or capability evolves",
      "Separation of Generation and Evaluation: Brainstorming (divergent) must be completed before matrix placement (convergent) to prevent premature filtering of creative ideas"
    ],
    "core_concepts_zh": [
      "双轴评估：将创意绘制在原创性（纵轴）与可行性（横轴）上，迫使团队同时评估两个维度，而非只默认其中一个",
      "哇象限：既新颖又可行的创意代表最高价值机会——足够创新以形成差异化，又足够实际以付诸交付",
      "如何象限：高原创性但低可行性的创意不被丢弃，而是作为未来投资机会存档，等待技术、预算或能力的演进",
      "生成与评估分离：头脑风暴（发散）必须在矩阵放置（收敛）之前完成，以防止过早筛选创造性想法"
    ],
    "timeline": [
      [
        "1967",
        "Alex Osborn's brainstorming principles and Sidney Parnes' Creative Problem Solving process establish the foundation for structured ideation techniques"
      ],
      [
        "2001",
        "The How-Now-Wow matrix concept emerges from innovation facilitation practices as a lightweight alternative to complex scoring models"
      ],
      [
        "2010",
        "Dave Gray, Sunni Brown, and James Macanufo publish 'Gamestorming', popularizing the matrix as a standard facilitation tool alongside other visual thinking games"
      ],
      [
        "2016",
        "Design sprint methodology (Jake Knapp / Google Ventures) incorporates similar prioritization matrices, bringing the technique into product development mainstream"
      ],
      [
        "2020s",
        "Remote collaboration tools (Miro, FigJam, MURAL) include How-Now-Wow templates as built-in features, making the technique accessible to distributed teams"
      ]
    ],
    "timeline_zh": [
      [
        "1967",
        "Alex Osborn的头脑风暴原则和Sidney Parnes的创造性问题解决流程为结构化构思技术奠定基础"
      ],
      [
        "2001",
        "如何-现在-哇矩阵概念从创新引导实践中出现，作为复杂评分模型的轻量替代"
      ],
      [
        "2010",
        "Dave Gray、Sunni Brown和James Macanufo出版《游戏风暴》，将该矩阵与其他视觉思维游戏一起推广为标准引导工具"
      ],
      [
        "2016",
        "设计冲刺方法论（Jake Knapp / Google Ventures）融入类似的优先级矩阵，将该技术带入产品开发主流"
      ],
      [
        "2020年代",
        "远程协作工具（Miro、FigJam、MURAL）将如何-现在-哇模板作为内置功能，使分布式团队也能便捷使用"
      ]
    ],
    "dos": [
      "Do enforce a strict 'no criticism' rule during brainstorming before matrix placement, because premature evaluation kills the novel ideas that would land in the Wow quadrant",
      "Do use dot voting or silent individual placement before group discussion, because group dynamics tend to anchor on the first loud opinion",
      "Do revisit the 'How' quadrant periodically, because technological advances may make previously infeasible ideas suddenly buildable",
      "Do calibrate 'feasibility' against your team's actual capabilities and timeline, because what's feasible for a 50-person team in 6 months differs from a 3-person team in 2 weeks"
    ],
    "dos_zh": [
      "在矩阵放置前严格执行头脑风暴的「禁止批评」规则，因为过早评估会扼杀本可落入哇象限的新颖想法",
      "在小组讨论前使用圆点投票或个人静默放置，因为群体动态倾向于锚定在第一个大声说出的意见上",
      "定期重新审视「如何」象限，因为技术进步可能使之前不可行的创意突然变得可构建",
      "根据团队的实际能力和时间线校准「可行性」，因为50人团队在6个月内可行的事情与3人团队在2周内可行的不同"
    ],
    "donts": [
      "Don't confuse 'Now' ideas with the best ideas, because easy and obvious solutions are often incremental and won't create competitive differentiation",
      "Don't let a single person place all the ideas on the matrix, because diverse perspectives on feasibility and originality produce more accurate placement",
      "Don't skip the challenge statement step, because without a clear problem frame, ideas scatter across unrelated problem spaces and the matrix becomes meaningless",
      "Don't use the matrix as the final decision tool, because placement is subjective and the top 'Wow' ideas still need validation through prototyping and testing"
    ],
    "donts_zh": [
      "不要将「现在」类创意等同于最佳创意，因为容易且显而易见的方案通常是渐进的，不会创造竞争差异化",
      "不要让一个人独自将所有创意放置到矩阵上，因为对可行性和原创性的多元视角能产生更准确的放置",
      "不要跳过挑战陈述步骤，因为没有清晰的问题框架，创意会分散到无关的问题空间，矩阵变得无意义",
      "不要将矩阵作为最终决策工具，因为放置是主观的，排名最高的「哇」级创意仍需通过原型和测试来验证"
    ],
    "case_study_company": "Google Ventures",
    "case_study": "Google Ventures (GV) incorporates How-Now-Wow style prioritization in their Design Sprint methodology, which they've run with over 150 startups. In one notable sprint with Blue Bottle Coffee, the team generated over 50 ideas for improving the online ordering experience. Using matrix-based prioritization, they identified a 'Wow' idea — a guided coffee taste quiz that recommended beans based on flavor preferences — which was prototyped and tested with users in just 5 days, leading to a significant increase in conversion rate after implementation.",
    "case_study_zh": "Google Ventures（GV）在其设计冲刺方法论中融入了如何-现在-哇风格的优先级排序，已与150多家初创公司合作实施。在与Blue Bottle Coffee的一次值得注意的冲刺中，团队为改善在线订购体验产生了50多个创意。通过基于矩阵的优先级排序，他们识别出一个「哇」级创意——一个根据口味偏好推荐咖啡豆的引导式品味测验——在短短5天内完成原型制作和用户测试，实施后显著提升了转化率。",
    "when_not_to_use": [
      "When you have fewer than 5 ideas to evaluate, because the matrix adds overhead without value for small option sets",
      "When the decision criteria are purely quantitative (cost, performance benchmarks) and subjective originality assessment is irrelevant",
      "When the team hasn't done adequate problem research and is brainstorming solutions to the wrong problem",
      "When all ideas are equally constrained by a single blocking factor (e.g., regulatory approval), making feasibility assessment trivial"
    ],
    "when_not_to_use_zh": [
      "当待评估的创意少于5个时，因为矩阵对小选项集增加了无价值的开销",
      "当决策标准纯粹是定量的（成本、性能基准），主观的原创性评估不相关时",
      "当团队尚未做充分的问题研究，正在为错误的问题头脑风暴解决方案时",
      "当所有创意同样受到单一阻断因素（如监管审批）的约束，使可行性评估变得平凡时"
    ],
    "adopters": [
      "Google Ventures",
      "IDEO",
      "Spotify",
      "Atlassian",
      "Salesforce",
      "SAP"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Dave Gray, Sunni Brown, and James Macanufo (2010). \"Gamestorming: A Playbook for Innovators, Rulebreakers, and Changemakers\". O'Reilly Media.",
    "secondary_sources": [
      "Edward de Bono (1992). \"Serious Creativity: Using the Power of Lateral Thinking to Create New Ideas\". HarperBusiness.",
      "Luma Institute (2012). \"Innovating for People: Handbook of Human-Centered Design Methods\". LUMA Institute."
    ],
    "typed_relations": [
      {
        "slug": "design-thinking-ideo",
        "type": "complement"
      },
      {
        "slug": "six-thinking-hats",
        "type": "related"
      },
      {
        "slug": "jobs-to-be-done",
        "type": "related"
      }
    ]
  },
  {
    "id": 10,
    "name": "Six Thinking Hats",
    "name_zh": "六顶思考帽",
    "slug": "six-thinking-hats",
    "category": "thinking",
    "desc": "Parallel thinking method using 6 cognitive perspective modes",
    "desc_zh": "采用六种认知视角模式的平行思维决策方法",
    "steps": [
      "Set the Focus Question: define the decision or design problem that all participants will examine together",
      "White Hat (Data): present all known facts, data, and information gaps without interpretation",
      "Red Hat (Emotion) + Black Hat (Caution) + Yellow Hat (Optimism): surface gut reactions, risks, and benefits in dedicated turns",
      "Green Hat (Creativity): generate alternative ideas and creative possibilities without criticism",
      "Blue Hat (Process): synthesize all perspectives, identify consensus, and define next actions"
    ],
    "steps_zh": [
      "设定焦点问题：定义所有参与者将共同检视的决策或设计问题",
      "白帽（数据）：不加诠释地呈现所有已知事实、数据和信息缺口",
      "红帽（情感）+黑帽（谨慎）+黄帽（乐观）：在专项轮次中分别表达直觉反应、风险与收益",
      "绿帽（创意）：在无批判氛围下生成替代创意与创造性可能性",
      "蓝帽（流程）：综合所有视角，识别共识，定义后续行动"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "White (Facts)",
      "Red (Emotion)",
      "Black (Caution)",
      "Yellow (Positive)",
      "Green (Creative)",
      "Blue (Process)"
    ],
    "viz_labels_zh": [
      "白帽事实",
      "红帽情感",
      "黑帽批判",
      "黄帽乐观",
      "绿帽创意",
      "蓝帽流程"
    ],
    "related": [
      "problem-framing-how-now-wow",
      "first-principles-thinking",
      "trade-off-sliders"
    ],
    "tags": [
      "decision-making",
      "perspectives",
      "collaboration",
      "parallel-thinking"
    ],
    "origin_author": "Edward de Bono, 1985",
    "origin_source": "Six Thinking Hats (Edward de Bono, 1985)",
    "origin_source_zh": "《六顶思考帽》（爱德华·德博诺，1985年）",
    "complexity": "beginner",
    "when_to_use": [
      "When meetings devolve into adversarial debates where people argue positions rather than exploring the problem from multiple angles",
      "When a design review is dominated by a few vocal critics and quieter team members' perspectives (optimism, creativity, data) go unheard",
      "When a team needs to make a consequential architectural decision and wants to ensure all cognitive angles are systematically covered",
      "When post-mortems become blame sessions and you need a structured way to separate factual analysis from emotional reactions"
    ],
    "when_to_use_zh": [
      "当会议退化为对抗性辩论，人们争论立场而非从多角度探索问题时",
      "当设计评审被少数直言不讳的批评者主导，安静团队成员的视角（乐观、创意、数据）未被听到时",
      "当团队需要做出重大架构决策，希望确保系统性地覆盖所有认知角度时",
      "当事后复盘变成指责大会，需要一种结构化方式来分离事实分析和情绪反应时"
    ],
    "core_concepts": [
      "Parallel Thinking: All participants wear the same 'hat' simultaneously, thinking in the same direction together, eliminating adversarial argument and ego-driven debate",
      "Cognitive Mode Separation: Deliberately separating data (White), emotion (Red), caution (Black), optimism (Yellow), creativity (Green), and process (Blue) prevents them from muddling each other",
      "Legitimate Emotional Input: The Red Hat gives explicit permission to express feelings and intuition without justification, acknowledging that emotion is a valid input to decision-making",
      "Structured Exploration: By sequencing hats deliberately, a facilitator ensures every perspective gets airtime regardless of team dynamics or personality dominance"
    ],
    "core_concepts_zh": [
      "平行思维：所有参与者同时戴同一顶「帽子」，朝同一方向一起思考，消除对抗性争论和自我驱动的辩论",
      "认知模式分离：刻意分离数据（白帽）、情感（红帽）、谨慎（黑帽）、乐观（黄帽）、创意（绿帽）和流程（蓝帽），防止它们相互混淆",
      "合法的情感输入：红帽明确允许不加辩解地表达感受和直觉，承认情感是决策的有效输入",
      "结构化探索：通过有意编排帽子顺序，引导者确保每个视角都获得发言时间，不受团队动态或个性支配的影响"
    ],
    "timeline": [
      [
        "1985",
        "Edward de Bono publishes 'Six Thinking Hats', introducing the parallel thinking methodology"
      ],
      [
        "1992",
        "The method gains widespread corporate adoption after de Bono conducts training programs at major multinational corporations"
      ],
      [
        "2005",
        "Six Thinking Hats is reported to have reduced meeting times by 50-80% at organizations like Prudential Insurance and DuPont"
      ],
      [
        "2010s",
        "The technique is adapted for software retrospectives and design reviews in Agile teams, often combined with other facilitation methods"
      ],
      [
        "2021",
        "Edward de Bono passes away; the method remains one of the most widely taught creative thinking techniques in business education worldwide"
      ]
    ],
    "timeline_zh": [
      [
        "1985",
        "爱德华·德博诺出版《六顶思考帽》，引入平行思维方法论"
      ],
      [
        "1992",
        "在德博诺为大型跨国公司进行培训后，该方法获得广泛的企业采用"
      ],
      [
        "2005",
        "报告显示六顶思考帽在保德信保险和杜邦等组织将会议时间减少了50-80%"
      ],
      [
        "2010年代",
        "该技术被调整用于敏捷团队的软件回顾和设计评审，通常与其他引导方法结合使用"
      ],
      [
        "2021",
        "爱德华·德博诺去世；该方法仍是全球商业教育中最广泛教授的创造性思维技术之一"
      ]
    ],
    "dos": [
      "Do enforce one hat at a time for all participants, because parallel thinking only works when everyone explores the same cognitive mode simultaneously",
      "Do use a designated facilitator wearing the Blue Hat, because someone needs to manage hat transitions and ensure balanced time allocation",
      "Do explicitly invite Red Hat contributions from everyone, because people often suppress intuition in technical discussions and gut feelings can surface important risks",
      "Do timebox each hat to 3-5 minutes, because time pressure forces concise contributions and prevents any single perspective from dominating the session"
    ],
    "dos_zh": [
      "强制所有参与者同一时间戴同一顶帽子，因为平行思维只在所有人同时探索相同认知模式时才有效",
      "指定一位戴蓝帽的引导者，因为需要有人管理帽子切换并确保平衡的时间分配",
      "明确邀请每个人贡献红帽观点，因为人们常在技术讨论中压制直觉，而直觉感受能浮现重要风险",
      "将每顶帽子限定在3-5分钟内，因为时间压力迫使简洁的贡献并防止任何单一视角主导会议"
    ],
    "donts": [
      "Don't allow participants to wear different hats simultaneously, because mixing critique (Black) with ideation (Green) kills creative ideas before they're fully formed",
      "Don't skip the Red Hat, because unexpressed emotional resistance to a decision will surface later as passive resistance or sabotage",
      "Don't use Six Thinking Hats for every small decision, because the structured format adds overhead that's only justified for decisions with significant consequences",
      "Don't let the Black Hat dominate, because risk-averse cultures naturally over-index on caution and the Black Hat can become a tool for blocking rather than informing"
    ],
    "donts_zh": [
      "不允许参与者同时戴不同的帽子，因为将批判（黑帽）与构思（绿帽）混在一起会在创意完全成形前扼杀它们",
      "不要跳过红帽，因为对决策未表达的情感抵触会在后续以消极抵抗或破坏的形式浮现",
      "不要对每个小决策都使用六顶思考帽，因为结构化格式增加的开销只对有重大后果的决策才有正当性",
      "不要让黑帽主导，因为风险厌恶的文化天然在谨慎上过度偏重，黑帽可能变成阻碍而非提供信息的工具"
    ],
    "case_study_company": "DuPont",
    "case_study": "DuPont adopted Six Thinking Hats across their organization in the early 2000s as part of their innovation and decision-making culture reform. Previously, their senior leadership meetings were characterized by lengthy adversarial debates. After implementing the method, DuPont reported that typical meeting durations dropped from multiple hours to under 45 minutes for the same agenda items, while the quality of decisions improved because all perspectives (data, risk, emotion, creativity) were systematically explored rather than the loudest voice winning.",
    "case_study_zh": "杜邦在2000年代初作为创新和决策文化改革的一部分，在整个组织中采用了六顶思考帽。此前，他们的高管会议以冗长的对抗性辩论为特征。实施该方法后，杜邦报告相同议程项目的典型会议时长从数小时降至45分钟以内，同时决策质量提升，因为所有视角（数据、风险、情感、创意）都被系统性地探索，而非最大声的人获胜。",
    "when_not_to_use": [
      "When the decision is trivial and doesn't warrant the overhead of a structured multi-perspective exercise",
      "When only one or two people are involved, because the method's primary value is in coordinating group cognition",
      "When the team needs deep analytical investigation rather than broad perspective exploration",
      "When cultural norms make it unsafe to express Red Hat (emotional) or Black Hat (critical) views openly, undermining the method's core mechanism"
    ],
    "when_not_to_use_zh": [
      "当决策琐碎，不值得结构化多视角练习的开销时",
      "当只有一两个人参与时，因为该方法的主要价值在于协调群体认知",
      "当团队需要深入分析调查而非广泛视角探索时",
      "当文化规范使人无法安全地公开表达红帽（情感）或黑帽（批判）观点，损害了该方法的核心机制时"
    ],
    "adopters": [
      "DuPont",
      "Prudential",
      "IBM",
      "Siemens",
      "NASA",
      "British Airways"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Edward de Bono (1985). \"Six Thinking Hats\". Little, Brown and Company.",
    "secondary_sources": [
      "Edward de Bono (1970). \"Lateral Thinking: Creativity Step by Step\". Harper & Row.",
      "Edward de Bono (1992). \"Serious Creativity: Using the Power of Lateral Thinking to Create New Ideas\". HarperBusiness."
    ],
    "typed_relations": [
      {
        "slug": "problem-framing-how-now-wow",
        "type": "complement"
      },
      {
        "slug": "first-principles-thinking",
        "type": "alternative"
      },
      {
        "slug": "trade-off-sliders",
        "type": "complement"
      }
    ]
  },
  {
    "id": 11,
    "name": "Analogical Thinking",
    "name_zh": "类比思维",
    "slug": "analogical-thinking",
    "category": "thinking",
    "desc": "Transfer structural solutions from source domains to software",
    "desc_zh": "将源领域的结构性解决方案迁移到软件设计问题中",
    "steps": [
      "Abstract the Problem Structure: strip away domain-specific details to reveal the underlying relational pattern of the problem",
      "Identify Source Domains: brainstorm non-software domains (biology, logistics, city planning, etc.) that exhibit similar structural patterns",
      "Extract the Structural Mapping: articulate the precise correspondence between source domain elements and target software elements",
      "Adapt the Solution: translate the source domain's solution mechanism into software constructs, adjusting for domain differences",
      "Validate the Analogy: test where the analogy breaks down and ensure those gaps are addressed in the final design"
    ],
    "steps_zh": [
      "抽象问题结构：剥离领域特定细节，揭示问题的底层关系模式",
      "识别源领域：在非软件领域（生物学、物流、城市规划等）中寻找具有相似结构模式的案例",
      "提取结构映射：清晰阐明源领域元素与目标软件元素之间的精确对应关系",
      "改造解决方案：将源领域的解决机制转化为软件构件，并针对领域差异进行调整",
      "验证类比有效性：测试类比在何处失效，确保最终设计中这些缺口已被充分处理"
    ],
    "ai_relevant": false,
    "viz_type": "venn",
    "viz_labels": [
      "Source Domain",
      "Target Domain",
      "Structural Mapping"
    ],
    "viz_labels_zh": [
      "源域",
      "目标域",
      "结构映射"
    ],
    "related": [
      "first-principles-thinking",
      "design-thinking-ideo",
      "gof-design-patterns"
    ],
    "tags": [
      "analogy",
      "cross-domain",
      "pattern-transfer",
      "creative-thinking"
    ],
    "origin_author": "Dedre Gentner, 1983 (Structure-Mapping Theory); broadly rooted in cognitive science",
    "origin_source": "Structure-Mapping: A Theoretical Framework for Analogy (Dedre Gentner, Cognitive Science, 1983)",
    "origin_source_zh": "《结构映射：类比的理论框架》（Dedre Gentner，《认知科学》，1983年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When the team is stuck on a design problem and has exhausted all obvious approaches within the software domain",
      "When explaining a complex technical architecture to non-technical stakeholders who need intuitive understanding",
      "When designing a novel system and you want to leverage proven patterns from biology, urban planning, or other mature domains",
      "When creating developer documentation or API naming and you need metaphors that make abstract concepts immediately graspable"
    ],
    "when_to_use_zh": [
      "当团队在设计问题上陷入困境，已耗尽软件领域内所有显而易见的方法时",
      "当向需要直观理解的非技术利益相关者解释复杂技术架构时",
      "当设计新系统并希望利用生物学、城市规划或其他成熟领域的经验证模式时",
      "当创建开发者文档或API命名，需要能使抽象概念立即可理解的比喻时"
    ],
    "core_concepts": [
      "Structure Mapping: Analogical reasoning works by mapping relational structures (not surface features) from a well-understood source domain to a less-understood target domain",
      "Source Domain Selection: The power of an analogy depends on choosing a source domain with deep structural similarity to the target, not just superficial resemblance",
      "Analogical Distance: Far analogies (biology to software) tend to produce more creative breakthroughs than near analogies (web app to mobile app), but require more validation",
      "Analogy Limits: Every analogy eventually breaks down, and identifying where the mapping fails is as important as identifying where it works, because unexamined analogies mislead"
    ],
    "core_concepts_zh": [
      "结构映射：类比推理通过将关系结构（而非表面特征）从被充分理解的源领域映射到较少理解的目标领域来工作",
      "源领域选择：类比的力量取决于选择与目标具有深层结构相似性的源领域，而非仅仅表面相似",
      "类比距离：远距类比（生物学到软件）往往比近距类比（Web应用到移动应用）产生更多创造性突破，但需要更多验证",
      "类比局限：每个类比最终都会失效，识别映射在哪里失败与识别在哪里有效同样重要，因为未经检验的类比会产生误导"
    ],
    "timeline": [
      [
        "1983",
        "Dedre Gentner publishes Structure-Mapping Theory, providing the first rigorous cognitive science framework for how analogical reasoning works"
      ],
      [
        "1994",
        "The Gang of Four publishes 'Design Patterns', which is fundamentally analogical thinking applied to software — patterns named after architectural and real-world analogies"
      ],
      [
        "2004",
        "Keith Holyoak and Paul Thagard publish research on analogical reasoning in innovation, showing how cross-domain analogies drive breakthrough inventions"
      ],
      [
        "2010s",
        "Biomimicry becomes a recognized design discipline, systematically applying biological analogies to engineering and software design problems"
      ],
      [
        "2020s",
        "AI/ML systems increasingly use analogical reasoning (few-shot learning, transfer learning) as a core mechanism, renewing interest in the cognitive foundations"
      ]
    ],
    "timeline_zh": [
      [
        "1983",
        "Dedre Gentner发表结构映射理论，提供了第一个关于类比推理如何工作的严谨认知科学框架"
      ],
      [
        "1994",
        "四人组（GoF）出版《设计模式》，本质上是类比思维在软件中的应用——模式以建筑和现实世界类比命名"
      ],
      [
        "2004",
        "Keith Holyoak和Paul Thagard发表关于创新中类比推理的研究，展示跨领域类比如何驱动突破性发明"
      ],
      [
        "2010年代",
        "仿生学成为公认的设计学科，系统性地将生物类比应用于工程和软件设计问题"
      ],
      [
        "2020年代",
        "AI/ML系统越来越多地使用类比推理（小样本学习、迁移学习）作为核心机制，重新引发了对认知基础的兴趣"
      ]
    ],
    "dos": [
      "Do map the structural relationships precisely, because vague analogies ('it's like a city') are useless — specify exactly which elements correspond to what",
      "Do seek analogies from distant domains, because cross-domain analogies (biology, logistics, ecology) produce more creative solutions than within-domain comparisons",
      "Do explicitly document where the analogy breaks down, because every analogy has limits and unexamined breakpoints become hidden design flaws",
      "Do use analogies to communicate as well as design, because a well-chosen analogy can make a complex architecture immediately understandable to new team members"
    ],
    "dos_zh": [
      "精确映射结构关系，因为模糊的类比（「它像一座城市」）毫无用处——要具体说明哪些元素对应什么",
      "从远距领域寻找类比，因为跨领域类比（生物学、物流、生态学）比领域内比较产生更多创造性方案",
      "明确记录类比在哪里失效，因为每个类比都有局限，未被检查的断裂点会成为隐藏的设计缺陷",
      "将类比同时用于沟通和设计，因为恰当选择的类比能使复杂架构立即为新团队成员所理解"
    ],
    "donts": [
      "Don't fall in love with the analogy and force-fit the target domain into the source domain's structure, because the map is not the territory",
      "Don't use surface-level analogies for design decisions, because similar-sounding domains can have fundamentally different structural properties",
      "Don't stop at one analogy, because comparing multiple source domains highlights which structural features are robust and which are artifacts of a particular analogy",
      "Don't assume the analogy communicates the same thing to everyone, because different team members may interpret the same metaphor differently based on their domain knowledge"
    ],
    "donts_zh": [
      "不要迷恋类比并将目标领域强行套入源领域的结构，因为地图不是领土",
      "不要用表面层次的类比来做设计决策，因为听起来相似的领域可能具有根本不同的结构属性",
      "不要满足于一个类比，因为比较多个源领域能揭示哪些结构特征是稳健的，哪些只是特定类比的产物",
      "不要假设类比对每个人传达相同的含义，因为不同团队成员可能根据其领域知识对同一比喻做出不同解读"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's Chaos Engineering practice originated from an analogy to immunology. Just as the immune system strengthens through controlled exposure to pathogens, Netflix designed Chaos Monkey (2011) to randomly terminate production instances, building system resilience through controlled failure injection. This biological analogy guided the design of increasingly sophisticated tools (Chaos Kong, FIT) that simulate progressively larger failures, directly mirroring how immune systems develop from handling small antigens to mounting complex immune responses.",
    "case_study_zh": "Netflix的混沌工程实践源于对免疫学的类比。正如免疫系统通过受控接触病原体来增强，Netflix设计了Chaos Monkey（2011年）来随机终止生产实例，通过受控故障注入来构建系统韧性。这个生物学类比指导了越来越复杂的工具（Chaos Kong、FIT）的设计，模拟逐渐更大的故障，直接映射了免疫系统从处理小抗原到发起复杂免疫反应的发展过程。",
    "when_not_to_use": [
      "When the problem has a well-known solution within the software domain and cross-domain exploration would add unnecessary complexity",
      "When precision is required and an analogy's inherent imprecision would lead to specification errors in safety-critical systems",
      "When the team lacks shared knowledge of the source domain, making the analogy confusing rather than illuminating",
      "When the problem is primarily a quantitative optimization (latency reduction, memory efficiency) where mathematical analysis is more appropriate than structural analogy"
    ],
    "when_not_to_use_zh": [
      "当问题在软件领域内有公认的解决方案，跨领域探索会增加不必要的复杂性时",
      "当需要精确性，类比固有的不精确性会在安全关键系统中导致规格错误时",
      "当团队缺乏对源领域的共同知识，使类比造成困惑而非启发时",
      "当问题主要是定量优化（延迟降低、内存效率），数学分析比结构类比更合适时"
    ],
    "adopters": [
      "Netflix",
      "Google (MapReduce analogy from functional programming)",
      "Kubernetes (ship steering/helmsman analogy)",
      "Docker (shipping container analogy)",
      "Apache Kafka (commit log analogy from databases)"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Dedre Gentner (1983). \"Structure-Mapping: A Theoretical Framework for Analogy\". Cognitive Science, 7(2).",
    "secondary_sources": [
      "Douglas Hofstadter and Emmanuel Sander (2013). \"Surfaces and Essences: Analogy as the Fuel and Fire of Thinking\". Basic Books.",
      "Keith Holyoak and Paul Thagard (1995). \"Mental Leaps: Analogy in Creative Thought\". MIT Press."
    ],
    "typed_relations": [
      {
        "slug": "first-principles-thinking",
        "type": "complement"
      },
      {
        "slug": "design-thinking-ideo",
        "type": "related"
      },
      {
        "slug": "gof-design-patterns",
        "type": "related"
      }
    ]
  },
  {
    "id": 12,
    "name": "Human-AI Interaction Design (HAI)",
    "name_zh": "人机交互设计（AI时代）",
    "slug": "human-ai-interaction-design",
    "category": "thinking",
    "desc": "Design AI-augmented workflows balancing autonomy and control",
    "desc_zh": "设计平衡自主性与控制权的AI增强型工作流程",
    "steps": [
      "Define the Human-AI Task Split: identify which subtasks the AI handles autonomously, which require human oversight, and which are collaborative",
      "Design Explainability Touchpoints: specify where and how the AI must surface its reasoning, confidence, and uncertainty to users",
      "Model Trust Calibration: design mechanisms for users to build appropriate trust — neither over-relying on nor under-using AI output",
      "Handle AI Failure Modes: design graceful degradation paths for hallucination, low-confidence output, and distribution shift scenarios",
      "Iterate with Human-in-the-Loop Feedback: build feedback collection into the UX to continuously improve AI behavior from real usage"
    ],
    "steps_zh": [
      "定义人机任务分工：识别哪些子任务由AI自主处理、哪些需要人类监督、哪些是协作完成",
      "设计可解释性触点：规定AI在何处以何种方式向用户呈现其推理过程、置信度和不确定性",
      "建模信任校准：设计帮助用户建立适度信任的机制——既不过度依赖AI输出，也不对其利用不足",
      "处理AI失效模式：为幻觉、低置信度输出和分布偏移场景设计优雅降级路径",
      "通过人在环路反馈迭代：将反馈收集内嵌于用户体验，从真实使用中持续改善AI行为"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "User Intent",
      "AI Output",
      "Feedback",
      "Trust",
      "Control"
    ],
    "viz_labels_zh": [
      "用户意图",
      "AI输出",
      "用户反馈",
      "信任建立",
      "人工控制"
    ],
    "related": [
      "human-in-the-loop",
      "ai-pair-programming",
      "responsible-ai-design"
    ],
    "tags": [
      "human-ai",
      "trust",
      "explainability",
      "interaction-design",
      "autonomy"
    ],
    "origin_author": "Microsoft Research / Saleema Amershi et al., 2019",
    "origin_source": "Guidelines for Human-AI Interaction (Saleema Amershi et al., CHI 2019)",
    "origin_source_zh": "《人机交互指南》（Saleema Amershi等，CHI 2019会议）",
    "complexity": "advanced",
    "when_to_use": [
      "When building a product that uses AI/ML models and you need to design how users interact with AI-generated outputs",
      "When users report they don't trust the AI's suggestions or, conversely, when they blindly follow AI output without verification",
      "When designing AI-assisted workflows in high-stakes domains (healthcare, finance, legal) where human oversight is essential",
      "When transitioning a manual workflow to an AI-augmented one and you need to decide what stays human-controlled vs. what becomes automated"
    ],
    "when_to_use_zh": [
      "当构建使用AI/ML模型的产品，需要设计用户如何与AI生成的输出交互时",
      "当用户报告不信任AI的建议，或相反当他们不加验证地盲目遵循AI输出时",
      "当在高风险领域（医疗、金融、法律）设计AI辅助工作流，人类监督至关重要时",
      "当将手动工作流过渡为AI增强型工作流，需要决定什么保持人工控制与什么变为自动化时"
    ],
    "core_concepts": [
      "Levels of Automation: A spectrum from full human control to full AI autonomy, with the optimal level depending on task risk, AI reliability, and user expertise",
      "Trust Calibration: Designing interactions so users develop appropriately calibrated trust — trusting AI when it's reliable and questioning it when it's uncertain",
      "Explainability by Design: Building explanations of AI reasoning into the interface as a first-class design element, not an afterthought",
      "Graceful Degradation: Ensuring the system remains useful and safe when the AI component fails, hallucinates, or encounters out-of-distribution inputs"
    ],
    "core_concepts_zh": [
      "自动化层级：从完全人工控制到完全AI自主的光谱，最优层级取决于任务风险、AI可靠性和用户专业水平",
      "信任校准：设计交互使用户发展出适当校准的信任——在AI可靠时信任它，在AI不确定时质疑它",
      "设计优先的可解释性：将AI推理的解释作为一等设计元素构建到界面中，而非事后补充",
      "优雅降级：确保当AI组件失败、产生幻觉或遇到分布外输入时，系统仍然有用且安全"
    ],
    "timeline": [
      [
        "1990",
        "Ben Shneiderman and other HCI pioneers establish foundational principles for human-computer interaction that inform later human-AI design thinking"
      ],
      [
        "2016",
        "Google publishes the PAIR (People + AI Research) initiative guidelines for designing human-AI experiences, formalizing the field"
      ],
      [
        "2019",
        "Saleema Amershi et al. publish 'Guidelines for Human-AI Interaction' at CHI 2019, setting out 18 guidelines and providing the most cited practical design framework for AI products"
      ],
      [
        "2020",
        "Apple's Human Interface Guidelines and Google's Material Design add AI-specific interaction patterns, mainstreaming HAI principles in platform design systems"
      ],
      [
        "2023",
        "The explosion of LLM-based products (ChatGPT, Copilot, Midjourney) creates urgent practical demand for HAI patterns around conversational AI, code generation, and creative tools"
      ]
    ],
    "timeline_zh": [
      [
        "1990",
        "Ben Shneiderman和其他HCI先驱建立人机交互基础原则，为后来的人与AI交互设计思想奠定基础"
      ],
      [
        "2016",
        "Google发布PAIR（People + AI Research）倡议指南，用于设计人与AI的交互体验，正式化该领域"
      ],
      [
        "2019",
        "Saleema Amershi等在CHI 2019发表「人机交互的18条指南」，提供了AI产品最被广泛引用的实用设计框架"
      ],
      [
        "2020",
        "Apple的人机界面指南和Google的Material Design添加AI特定交互模式，使HAI原则在平台设计系统中主流化"
      ],
      [
        "2023",
        "基于LLM的产品爆发（ChatGPT、Copilot、Midjourney）为对话式AI、代码生成和创意工具的HAI模式创造了紧迫的实际需求"
      ]
    ],
    "dos": [
      "Do show AI confidence levels when presenting suggestions, because users need calibration signals to know when to trust vs. verify AI output",
      "Do design clear 'escape hatches' that let users override or ignore AI suggestions easily, because user agency is essential for trust and for handling AI errors",
      "Do design for AI failure from day one, because AI systems will hallucinate, and users need to be able to recognize, report, and recover from incorrect outputs",
      "Do collect user feedback on AI outputs (thumbs up/down, corrections, overrides) and feed it back into model improvement, because real-world usage patterns are the best training signal",
      "Do communicate AI limitations transparently, because users who understand what the AI can and cannot do develop more appropriate usage patterns"
    ],
    "dos_zh": [
      "在呈现建议时显示AI置信度，因为用户需要校准信号来判断何时信任、何时验证AI输出",
      "设计清晰的「逃生通道」让用户能轻松覆盖或忽略AI建议，因为用户主体性对信任和处理AI错误至关重要",
      "从第一天起就为AI失败进行设计，因为AI系统会产生幻觉，用户需要能够识别、报告和从错误输出中恢复",
      "收集用户对AI输出的反馈（点赞/踩、修正、覆盖）并反馈到模型改进中，因为真实使用模式是最好的训练信号",
      "透明地传达AI的局限性，因为理解AI能做什么和不能做什么的用户会发展出更适当的使用模式"
    ],
    "donts": [
      "Don't present AI output with the same visual authority as human-verified information, because users need visual cues to distinguish AI-generated content from established facts",
      "Don't automate high-stakes decisions without a human review step, because AI errors in critical domains (healthcare, finance) can cause irreversible harm",
      "Don't design AI interactions that anthropomorphize the system to the point of creating false trust, because users who believe they're talking to a sentient entity won't appropriately question outputs",
      "Don't ignore the cold-start problem, because new users have no calibration for the AI's reliability and need guided onboarding to build appropriate trust"
    ],
    "donts_zh": [
      "不要以与人工验证信息相同的视觉权威性呈现AI输出，因为用户需要视觉线索来区分AI生成内容和已确立的事实",
      "不要在没有人工审核步骤的情况下自动化高风险决策，因为关键领域（医疗、金融）中的AI错误可能造成不可逆的伤害",
      "不要设计将系统拟人化到产生虚假信任的AI交互，因为相信自己在与有感知实体对话的用户不会适当质疑输出",
      "不要忽视冷启动问题，因为新用户对AI的可靠性没有校准，需要引导式入门来建立适当的信任"
    ],
    "case_study_company": "GitHub (Copilot)",
    "case_study": "GitHub Copilot applied HAI principles to design an AI pair programming experience that developers would actually trust. Rather than autonomously writing code, Copilot presents suggestions as ghost text that developers explicitly accept, modify, or dismiss — maintaining human agency. The team designed confidence-calibrating interactions: suggestions appear inline (low friction for likely-correct completions) while more uncertain code blocks require explicit tab-acceptance. After launch, GitHub found that developers accepted about 30% of suggestions, indicating healthy trust calibration rather than blind acceptance.",
    "case_study_zh": "GitHub Copilot应用HAI原则来设计开发者会真正信任的AI结对编程体验。Copilot不是自主编写代码，而是将建议以幽灵文本形式呈现，开发者明确接受、修改或忽略——保持了人类主体性。团队设计了信任校准交互：建议以内联方式出现（对可能正确的补全低摩擦），而更不确定的代码块需要显式的Tab接受。上线后，GitHub发现开发者接受了约30%的建议，表明健康的信任校准而非盲目接受。",
    "when_not_to_use": [
      "When the system has no AI/ML component and interactions are purely deterministic, making AI-specific design patterns unnecessary",
      "When the AI operates entirely in the backend with no user-facing decisions or outputs that need trust calibration",
      "When building an internal tool for AI/ML engineers who already understand model limitations and don't need guided trust calibration",
      "When the AI model's accuracy is so high and consequences so low that the overhead of explainability and human review isn't justified"
    ],
    "when_not_to_use_zh": [
      "当系统没有AI/ML组件，交互是纯确定性的，不需要AI特定设计模式时",
      "当AI完全在后端运行，没有需要信任校准的面向用户的决策或输出时",
      "当为已经理解模型局限性的AI/ML工程师构建内部工具，不需要引导式信任校准时",
      "当AI模型精度极高且后果很低，可解释性和人工审核的开销不合理时"
    ],
    "adopters": [
      "GitHub (Copilot)",
      "Google (Search AI Overviews, Gemini)",
      "Microsoft (Cortana, Copilot for M365)",
      "Apple (Siri, Apple Intelligence)",
      "Adobe (Firefly)",
      "Notion (Notion AI)"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Saleema Amershi et al. (2019). \"Guidelines for Human-AI Interaction\". ACM CHI 2019.",
    "secondary_sources": [
      "Ben Shneiderman (2022). \"Human-Centered AI\". Oxford University Press.",
      "Google PAIR (2019). \"People + AI Guidebook\". pair.withgoogle.com."
    ],
    "typed_relations": [
      {
        "slug": "human-in-the-loop",
        "type": "complement"
      },
      {
        "slug": "ai-pair-programming",
        "type": "related"
      },
      {
        "slug": "responsible-ai-design",
        "type": "related"
      }
    ]
  },
  {
    "id": 13,
    "name": "Agent-Oriented Design Thinking",
    "name_zh": "智能体导向设计思维",
    "slug": "agent-oriented-design-thinking",
    "category": "thinking",
    "desc": "Design multi-agent systems around roles, goals, and environments",
    "desc_zh": "围绕角色、目标与环境设计多智能体系统的思维框架",
    "steps": [
      "Define Agent Roles and Goals: specify each agent's purpose, objectives, and the scope of its autonomous decision-making authority",
      "Model the Environment: describe the state space agents perceive, including data sources, APIs, tools, and world state representations",
      "Design Agent Communication Protocols: define how agents share observations, delegate tasks, negotiate, and resolve conflicts",
      "Plan for Emergent Behavior: identify potential unintended interactions between agents and design containment or correction mechanisms",
      "Establish Evaluation Criteria: define measurable success metrics for individual agent behavior and overall system outcomes"
    ],
    "steps_zh": [
      "定义智能体角色与目标：明确每个智能体的用途、目标以及其自主决策权的范围",
      "建模环境：描述智能体感知的状态空间，包括数据源、API、工具和世界状态表示",
      "设计智能体通信协议：定义智能体如何共享观察、委托任务、协商并解决冲突",
      "规划涌现行为：识别智能体间潜在的非预期交互，并设计遏制或修正机制",
      "建立评估标准：定义可量化的成功指标，覆盖个体智能体行为和整体系统结果"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Goal",
      "Perception",
      "Action",
      "Memory",
      "Coordination"
    ],
    "viz_labels_zh": [
      "目标",
      "感知",
      "行动",
      "记忆",
      "协同"
    ],
    "related": [
      "multi-agent-orchestration-pattern",
      "agent-communication-protocol",
      "react-framework"
    ],
    "tags": [
      "agents",
      "multi-agent",
      "roles",
      "goals",
      "emergent-behavior"
    ],
    "origin_author": "Michael Wooldridge & Nicholas Jennings, 1995; extended by modern AI agent frameworks (LangChain, AutoGen, CrewAI), 2023",
    "origin_source": "Intelligent Agents: Theory and Practice (Michael Wooldridge & Nicholas Jennings, The Knowledge Engineering Review, 1995)",
    "origin_source_zh": "《智能体：理论与实践》（Michael Wooldridge和Nicholas Jennings，《知识工程评论》，1995年）",
    "complexity": "advanced",
    "when_to_use": [
      "When building a system where multiple AI components need to collaborate, delegate tasks, and share state to accomplish a complex goal",
      "When a single monolithic AI model can't handle the full scope of a task and decomposition into specialized agents improves quality",
      "When designing autonomous workflows (customer support, code generation, research) that require different AI capabilities at different stages",
      "When you need to add human checkpoints into an AI pipeline and must clearly define where agents hand off to humans and vice versa"
    ],
    "when_to_use_zh": [
      "当构建多个AI组件需要协作、委托任务和共享状态以完成复杂目标的系统时",
      "当单个整体式AI模型无法处理任务的全部范围，分解为专业化智能体能提高质量时",
      "当设计需要在不同阶段使用不同AI能力的自主工作流（客户支持、代码生成、研究）时",
      "当需要在AI管线中添加人工检查点，必须清晰定义智能体在何处移交给人类及反之时"
    ],
    "core_concepts": [
      "Agent Autonomy: Each agent has defined decision-making authority within its scope, acting on goals rather than following rigid scripts, enabling adaptive behavior",
      "Environment Modeling: Agents perceive and act within an environment (APIs, databases, tools, user interfaces) that must be explicitly modeled to define the agent's capabilities and constraints",
      "Inter-Agent Communication: Structured protocols (message passing, shared blackboards, pub/sub) for how agents share information, request help, and resolve conflicting goals",
      "Emergent Behavior Management: In multi-agent systems, individual agents following simple rules can produce unexpected system-level behavior that must be monitored, contained, and corrected"
    ],
    "core_concepts_zh": [
      "智能体自主性：每个智能体在其范围内有定义的决策权，基于目标行动而非遵循刚性脚本，使自适应行为成为可能",
      "环境建模：智能体在必须被显式建模的环境（API、数据库、工具、用户界面）中感知和行动，以定义智能体的能力和约束",
      "智能体间通信：用于智能体如何共享信息、请求帮助和解决目标冲突的结构化协议（消息传递、共享黑板、发布/订阅）",
      "涌现行为管理：在多智能体系统中，遵循简单规则的个体智能体可能产生需要监控、遏制和修正的意外系统级行为"
    ],
    "timeline": [
      [
        "1995",
        "Wooldridge and Jennings publish 'Intelligent Agents: Theory and Practice', establishing the foundational taxonomy of agent architectures (reactive, deliberative, hybrid)"
      ],
      [
        "2003",
        "JADE (Java Agent Development Framework) and the FIPA standards formalize agent communication languages and interaction protocols for multi-agent systems"
      ],
      [
        "2020",
        "OpenAI's work on tool-using agents and Google's research on language model agents spark renewed interest in agent-oriented design for LLM-based systems"
      ],
      [
        "2023",
        "AutoGen (Microsoft), LangChain Agents, and CrewAI launch, providing practical frameworks for building multi-agent LLM systems and making agent design accessible to application developers"
      ],
      [
        "2024",
        "Anthropic (Claude), OpenAI (Assistants API), and Google (Gemini) release production-grade agent APIs, establishing agent-oriented design as a mainstream software architecture pattern"
      ]
    ],
    "timeline_zh": [
      [
        "1995",
        "Wooldridge和Jennings发表「智能体：理论与实践」，建立智能体架构（反应式、审思式、混合式）的基础分类"
      ],
      [
        "2003",
        "JADE（Java智能体开发框架）和FIPA标准正式化了多智能体系统的智能体通信语言和交互协议"
      ],
      [
        "2020",
        "OpenAI关于工具使用型智能体的工作和Google关于语言模型智能体的研究重新激发了基于LLM系统的智能体导向设计兴趣"
      ],
      [
        "2023",
        "AutoGen（微软）、LangChain Agents和CrewAI发布，提供构建多智能体LLM系统的实用框架，使应用开发者能进行智能体设计"
      ],
      [
        "2024",
        "Anthropic（Claude）、OpenAI（Assistants API）和Google（Gemini）发布生产级智能体API，确立智能体导向设计为主流软件架构模式"
      ]
    ],
    "dos": [
      "Do define explicit boundaries for each agent's autonomy, because unbounded agents make unpredictable decisions that are difficult to debug or control",
      "Do design agent communication to be observable and loggable, because debugging multi-agent systems requires tracing the conversation between agents",
      "Do include kill switches and human escalation paths, because autonomous agents can enter loops or produce harmful outputs that need immediate interruption",
      "Do test multi-agent interactions with adversarial scenarios, because agents will encounter edge cases where their goals conflict or their environment provides unexpected inputs",
      "Do start with fewer agents and add complexity gradually, because premature decomposition into many agents creates coordination overhead that outweighs the benefits"
    ],
    "dos_zh": [
      "为每个智能体的自主性定义明确边界，因为无边界的智能体会做出难以调试或控制的不可预测决策",
      "将智能体通信设计为可观察和可日志记录的，因为调试多智能体系统需要追踪智能体之间的对话",
      "包含紧急停止开关和人工升级路径，因为自主智能体可能进入循环或产生需要立即中断的有害输出",
      "用对抗性场景测试多智能体交互，因为智能体会遇到其目标冲突或环境提供意外输入的边缘情况",
      "从较少的智能体开始并逐步增加复杂性，因为过早分解为多个智能体会产生超过收益的协调开销"
    ],
    "donts": [
      "Don't give agents more autonomy than the task requires, because excessive autonomy increases the blast radius of agent errors and makes the system harder to reason about",
      "Don't design agents without clear failure modes and fallback strategies, because in production, AI agents will encounter situations outside their training distribution",
      "Don't assume agents will cooperate perfectly, because misaligned goals, prompt injection, or unexpected inputs can cause agents to work at cross-purposes",
      "Don't build multi-agent systems when a single well-prompted agent would suffice, because multi-agent orchestration adds latency, cost, and debugging complexity"
    ],
    "donts_zh": [
      "不要给予智能体超过任务所需的自主权，因为过度自主会增加智能体错误的影响范围并使系统更难推理",
      "不要设计没有明确失败模式和回退策略的智能体，因为在生产中AI智能体会遇到其训练分布之外的情况",
      "不要假设智能体会完美合作，因为目标不对齐、提示注入或意外输入可能导致智能体相互矛盾地工作",
      "不要在单个精心提示的智能体就够用时构建多智能体系统，因为多智能体编排增加延迟、成本和调试复杂性"
    ],
    "case_study_company": "Microsoft (AutoGen)",
    "case_study": "Microsoft Research developed AutoGen as an open-source framework for building multi-agent LLM applications, applying agent-oriented design principles to real-world AI workflows. In internal deployments, Microsoft used AutoGen to create agent teams for complex coding tasks: a 'Coder' agent writes code, a 'Reviewer' agent critiques it, and an 'Executor' agent runs tests — mimicking a human development team. This multi-agent approach produced higher-quality code than single-agent generation because each agent's specialized role caught different categories of errors, reducing bugs by 25% compared to single-agent baselines in their benchmarks.",
    "case_study_zh": "微软研究院开发了AutoGen作为构建多智能体LLM应用的开源框架，将智能体导向设计原则应用于实际AI工作流。在内部部署中，微软使用AutoGen为复杂编码任务创建智能体团队：「编码者」智能体编写代码，「审查者」智能体进行评审，「执行者」智能体运行测试——模拟人类开发团队。这种多智能体方法比单智能体生成产出了更高质量的代码，因为每个智能体的专业化角色捕获了不同类别的错误，在其基准测试中相比单智能体基线减少了25%的缺陷。",
    "when_not_to_use": [
      "When a single AI model with good prompting can handle the entire task, because unnecessary agent decomposition adds latency and cost without improving quality",
      "When the task requires deterministic, reproducible outputs and agent autonomy introduces unacceptable variability",
      "When the budget for LLM API calls is constrained, because multi-agent systems multiply token usage across agent conversations",
      "When the team lacks experience operating AI systems in production and the debugging complexity of multi-agent systems would overwhelm them"
    ],
    "when_not_to_use_zh": [
      "当单个AI模型配合良好的提示就能处理整个任务时，因为不必要的智能体分解增加延迟和成本而不改善质量",
      "当任务需要确定性、可重现的输出，智能体自主性引入不可接受的可变性时",
      "当LLM API调用预算受限时，因为多智能体系统在智能体对话中倍增令牌使用",
      "当团队缺乏在生产中运营AI系统的经验，多智能体系统的调试复杂性会让他们不堪重负时"
    ],
    "adopters": [
      "Microsoft (AutoGen, Copilot)",
      "Anthropic (Claude Agent SDK)",
      "OpenAI (Assistants API)",
      "Google DeepMind (Gemini Agents)",
      "LangChain",
      "CrewAI"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Michael Wooldridge and Nicholas Jennings (1995). \"Intelligent Agents: Theory and Practice\". The Knowledge Engineering Review, 10(2).",
    "secondary_sources": [
      "Michael Wooldridge (2009). \"An Introduction to MultiAgent Systems, 2nd Edition\". Wiley.",
      "Stuart Russell and Peter Norvig (2020). \"Artificial Intelligence: A Modern Approach, 4th Edition\". Pearson. Chapter 2: Intelligent Agents."
    ],
    "typed_relations": [
      {
        "slug": "multi-agent-orchestration-pattern",
        "type": "complement"
      },
      {
        "slug": "agent-communication-protocol",
        "type": "related"
      },
      {
        "slug": "react-framework",
        "type": "related"
      }
    ]
  },
  {
    "id": 14,
    "name": "Trade-off Sliders Model",
    "name_zh": "权衡滑块模型",
    "slug": "trade-off-sliders",
    "category": "thinking",
    "desc": "Make design trade-offs explicit by ranking competing qualities",
    "desc_zh": "通过排列相互竞争的质量属性，将设计权衡显式化",
    "steps": [
      "Enumerate Quality Attributes: list all relevant system qualities (performance, security, maintainability, cost, scalability, etc.) for the project",
      "Force-Rank the Attributes: have key stakeholders independently rank qualities from most to least critical, then consolidate into a shared ranking",
      "Set Slider Positions: assign a relative priority score (1-5) to each attribute, explicitly acknowledging what is sacrificed at each end",
      "Apply to Design Decisions: when facing a design choice, evaluate each option against the slider positions to find the best-fit solution",
      "Revisit Periodically: re-run the sliders exercise at major milestones to reflect evolved business priorities and technical context"
    ],
    "steps_zh": [
      "枚举质量属性：列出项目所有相关的系统质量（性能、安全、可维护性、成本、可扩展性等）",
      "强制排序属性：让关键干系人独立排序质量属性（最重要到最次要），再整合为共识排序",
      "设定滑块位置：为每个属性赋予相对优先级分值（1-5），明确标注每端所牺牲的内容",
      "应用于设计决策：面对设计选择时，根据滑块位置评估每个方案以找到最优契合方案",
      "定期重新审视：在重要里程碑重新进行滑块练习，以反映业务优先级和技术背景的演变"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "Performance",
      "Security",
      "Scalability",
      "Cost",
      "Simplicity"
    ],
    "viz_labels_zh": [
      "性能",
      "安全",
      "可扩展",
      "成本",
      "简洁"
    ],
    "related": [
      "atam",
      "qaw",
      "cynefin-framework"
    ],
    "tags": [
      "trade-offs",
      "quality-attributes",
      "prioritization",
      "stakeholder-alignment"
    ],
    "origin_author": "Kent Beck (Extreme Programming) / SEI Architecture Trade-off Analysis Method (ATAM), late 1990s",
    "origin_source": "Extreme Programming Explained (Kent Beck, 1999) and SEI ATAM method documentation (Kazman, Klein, Clements, 2000)",
    "origin_source_zh": "《解析极限编程》（Kent Beck，1999年）及SEI ATAM方法文档（Kazman、Klein、Clements，2000年）",
    "complexity": "beginner",
    "when_to_use": [
      "When architects and product managers keep making inconsistent decisions because there's no shared understanding of which qualities matter most",
      "When a team faces a design choice where improving one quality attribute (e.g., performance) would degrade another (e.g., maintainability) and they need a principled tiebreaker",
      "When onboarding new team members who need to quickly understand why past design decisions prioritized certain attributes over others",
      "When scaling a system and the original implicit trade-offs (favoring development speed over operational reliability, for example) are no longer appropriate"
    ],
    "when_to_use_zh": [
      "当架构师和产品经理持续做出不一致的决策，因为没有对哪些质量最重要的共同理解时",
      "当团队面对改善一个质量属性（如性能）会降低另一个（如可维护性）的设计选择，需要有原则的裁决时",
      "当入职新团队成员，他们需要快速理解为何过往设计决策优先考虑了某些属性而非其他属性时",
      "当扩展系统而原来隐含的权衡（如偏重开发速度而非运营可靠性）不再适用时"
    ],
    "core_concepts": [
      "Explicit Trade-offs: Making quality attribute priorities visible and documented, rather than leaving them as implicit assumptions that different team members interpret differently",
      "Force-Ranking: Requiring stakeholders to rank attributes eliminates the common dysfunction of declaring everything equally important, which is the same as declaring nothing important",
      "Slider Positions as Decision Heuristics: Once slider positions are set, they serve as fast heuristics for everyday design decisions without requiring a full stakeholder meeting each time",
      "Temporal Validity: Trade-off positions are valid for a phase of the project, not forever — what matters in an MVP differs from what matters in a mature production system"
    ],
    "core_concepts_zh": [
      "显式权衡：使质量属性优先级可见并有文档记录，而非留作不同团队成员可能有不同解读的隐含假设",
      "强制排序：要求利益相关者排序属性，消除了声称所有事物同等重要的常见功能障碍——这等同于声称没有事物重要",
      "滑块位置作为决策启发式：一旦设定滑块位置，它们就可以作为日常设计决策的快速启发式，无需每次都召开利益相关者会议",
      "时间有效性：权衡位置对项目的某个阶段有效，而非永远——MVP阶段重要的事物与成熟生产系统重要的事物不同"
    ],
    "timeline": [
      [
        "1996",
        "The Software Engineering Institute (SEI) develops the Architecture Trade-off Analysis Method (ATAM), formalizing quality attribute trade-off analysis for software architecture"
      ],
      [
        "1999",
        "Kent Beck publishes 'Extreme Programming Explained', introducing the metaphor of 'dials' that stakeholders set to express project priorities"
      ],
      [
        "2003",
        "The SEI publishes 'Evaluating Software Architectures' (Clements, Kazman, Klein), providing detailed guidance on quality attribute workshops and trade-off analysis"
      ],
      [
        "2010s",
        "Agile teams widely adopt lightweight slider exercises in sprint planning and architecture decision records (ADRs) to document trade-off reasoning"
      ],
      [
        "2020s",
        "Platform engineering teams use trade-off sliders to navigate build-vs-buy decisions for cloud infrastructure, balancing cost, control, and operational overhead"
      ]
    ],
    "timeline_zh": [
      [
        "1996",
        "软件工程研究所（SEI）开发架构权衡分析方法（ATAM），为软件架构的质量属性权衡分析正式化"
      ],
      [
        "1999",
        "Kent Beck出版《解析极限编程》，引入利益相关者设定以表达项目优先级的「旋钮」比喻"
      ],
      [
        "2003",
        "SEI出版《评估软件架构》（Clements、Kazman、Klein），提供质量属性研讨会和权衡分析的详细指导"
      ],
      [
        "2010年代",
        "敏捷团队广泛在冲刺规划和架构决策记录（ADR）中采用轻量级滑块练习来记录权衡推理"
      ],
      [
        "2020年代",
        "平台工程团队使用权衡滑块来指导云基础设施的自建与外购决策，平衡成本、控制和运营开销"
      ]
    ],
    "dos": [
      "Do force stakeholders to rank rather than rate attributes, because ranking eliminates the 'everything is priority 1' problem that rating systems allow",
      "Do document the reasoning behind slider positions, because future team members need to understand why security was ranked above velocity, not just that it was",
      "Do include both technical and business stakeholders in the ranking exercise, because engineering-only rankings miss business context and business-only rankings miss technical feasibility",
      "Do use the sliders as a living reference in architecture decision records, because linking design decisions back to slider positions makes trade-off reasoning traceable"
    ],
    "dos_zh": [
      "强制利益相关者排序而非评分属性，因为排序消除了评分系统允许的「所有事物都是优先级1」的问题",
      "记录滑块位置背后的推理，因为未来的团队成员需要理解为何安全性被排在速度之上，而不仅仅是知道它被排在上面",
      "在排序练习中同时包含技术和商业利益相关者，因为仅工程排序缺少商业背景，仅商业排序缺少技术可行性",
      "将滑块用作架构决策记录中的活参考，因为将设计决策链接回滑块位置使权衡推理可追溯"
    ],
    "donts": [
      "Don't allow stakeholders to declare all attributes equally important, because in practice resources are finite and trade-offs are inevitable — equal ranking is an abdication of decision-making",
      "Don't set sliders once and forget them, because as the product matures from MVP to scale-up, the relative importance of quality attributes shifts dramatically",
      "Don't use sliders to avoid making hard decisions, because sliders inform decisions but a team still needs to commit to a specific design path based on the priorities",
      "Don't include too many attributes (more than 7-8), because cognitive overload defeats the purpose of having clear, actionable priorities"
    ],
    "donts_zh": [
      "不要允许利益相关者声称所有属性同等重要，因为在实际中资源有限、权衡不可避免——相同排序是对决策的逃避",
      "不要设定一次滑块后就遗忘，因为随着产品从MVP成熟到规模化，质量属性的相对重要性会发生剧变",
      "不要用滑块来逃避艰难决策，因为滑块是为决策提供信息的工具，团队仍需基于优先级承诺特定的设计路径",
      "不要包含过多属性（超过7-8个），因为认知过载会违背拥有清晰、可操作优先级的目的"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify's engineering culture explicitly used trade-off slider thinking when building their microservices platform. Teams were given clear organizational priorities: developer velocity ranked above operational efficiency, and autonomy ranked above consistency. This explicit trade-off led to their decision to let squads choose their own technology stacks (prioritizing autonomy and velocity) even at the cost of some cross-team inconsistency. The clarity of these trade-off positions enabled over 250 independent squads to make locally optimal decisions that aligned with organizational strategy.",
    "case_study_zh": "Spotify的工程文化在构建微服务平台时明确使用了权衡滑块思维。团队获得了清晰的组织优先级：开发者速度排在运营效率之上，自主性排在一致性之上。这种显式权衡导致了他们让小队选择自己技术栈的决策（优先考虑自主性和速度），即使以某些跨团队不一致性为代价。这些权衡位置的清晰性使超过250个独立小队能够做出与组织战略一致的局部最优决策。",
    "when_not_to_use": [
      "When the project has a single dominant quality requirement (e.g., regulatory compliance) that overrides all other considerations, making a ranking exercise unnecessary",
      "When the team is so small (1-2 people) that trade-offs are managed through informal conversation and a formal exercise would be overhead",
      "When the project is a short experiment or spike where the investment in quality attribute analysis won't pay off before the work is discarded",
      "When stakeholders are unable or unwilling to commit to rankings and the exercise would produce a meaningless consensus of 'everything is important'"
    ],
    "when_not_to_use_zh": [
      "当项目有单一主导的质量需求（如法规合规）覆盖所有其他考虑，使排序练习不必要时",
      "当团队很小（1-2人），权衡通过非正式对话管理，正式练习是多余的开销时",
      "当项目是短期实验或探针，质量属性分析的投入在工作被抛弃前无法回收时",
      "当利益相关者无法或不愿承诺排序，练习只会产生「所有事物都重要」的无意义共识时"
    ],
    "adopters": [
      "Spotify",
      "Atlassian",
      "ThoughtWorks",
      "Pivotal (now VMware Tanzu)",
      "Netflix",
      "Stripe"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Kent Beck (1999). \"Extreme Programming Explained: Embrace Change\". Addison-Wesley.",
    "secondary_sources": [
      "Rick Kazman, Mark Klein, and Paul Clements (2000). \"ATAM: Method for Architecture Evaluation\". SEI Technical Report CMU/SEI-2000-TR-004.",
      "Barry Boehm and Richard Turner (2003). \"Balancing Agility and Discipline: A Guide for the Perplexed\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "atam",
        "type": "complement"
      },
      {
        "slug": "qaw",
        "type": "complement"
      },
      {
        "slug": "cynefin-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 161,
    "name": "Complexity Budget",
    "name_zh": "复杂度预算",
    "slug": "complexity-budget",
    "category": "thinking",
    "desc": "Every module gets a complexity budget; exceed it and you must decompose",
    "desc_zh": "每个模块拥有复杂度预算；超出时必须进行分解",
    "steps": [
      "Inventory Complexity Sources: for each module, enumerate all sources of complexity — interface size, number of special cases, hidden dependencies, and cognitive load required to understand it",
      "Assign Budgets: establish a maximum acceptable complexity threshold for each module based on its role (leaf modules get smaller budgets, orchestration modules get slightly more)",
      "Measure Against Budget: evaluate each module against its budget using proxies such as lines of code per method, cyclomatic complexity, number of parameters, and depth of abstraction layers",
      "Decompose Over-Budget Modules: when a module exceeds its budget, split it by extracting cohesive sub-responsibilities into new deep modules with simple interfaces",
      "Track Budget Over Time: integrate complexity metrics into CI/CD so that budget violations surface as warnings, preventing gradual complexity creep across releases"
    ],
    "steps_zh": [
      "盘点复杂度来源：为每个模块列出所有复杂度来源——接口大小、特殊情况数量、隐藏依赖和理解所需的认知负荷",
      "分配预算：根据模块角色为其建立最大可接受的复杂度阈值（叶子模块获得更小的预算，编排模块稍多）",
      "对照预算衡量：使用每个方法的代码行数、圈复杂度、参数数量和抽象层深度等代理指标评估每个模块",
      "分解超预算模块：当模块超出预算时，将内聚的子职责提取到具有简单接口的新深模块中",
      "持续跟踪预算：将复杂度指标集成到CI/CD中，使预算违规作为警告浮现，防止跨版本的渐进式复杂度蔓延"
    ],
    "ai_relevant": true,
    "viz_type": "pyramid",
    "viz_labels": [
      "Essential Complexity",
      "Accidental Complexity",
      "Budget Limit"
    ],
    "viz_labels_zh": [
      "本质复杂度",
      "偶然复杂度",
      "复杂度预算"
    ],
    "related": [
      "deep-vs-shallow-modules",
      "separation-of-concerns",
      "domain-driven-design"
    ],
    "tags": [
      "complexity",
      "modularity",
      "software-design",
      "decomposition",
      "cognitive-load"
    ],
    "origin_author": "John Ousterhout, 2018",
    "origin_source": "A Philosophy of Software Design (John Ousterhout, 2018)",
    "origin_source_zh": "《软件设计哲学》（John Ousterhout，2018年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When a codebase has grown organically and certain modules have become 'god classes' that no single developer can fully comprehend",
      "When code review discussions repeatedly flag the same modules as hard to understand or risky to change",
      "When onboarding time for new engineers is excessive because key modules require too much context to work with safely",
      "When bug density is concentrated in a few complex modules and you need a principled strategy for simplification"
    ],
    "when_to_use_zh": [
      "当代码库自然增长，某些模块已变成没有任何开发者能完全理解的「上帝类」时",
      "当代码审查讨论反复标记相同模块难以理解或修改风险高时",
      "当新工程师的入职时间过长，因为关键模块需要太多上下文才能安全地工作时",
      "当缺陷密度集中在少数复杂模块中，需要一种有原则的简化策略时"
    ],
    "core_concepts": [
      "Complexity as a Finite Resource: Treating complexity like a budget forces teams to make conscious allocation decisions rather than letting complexity accumulate invisibly",
      "Cognitive Load as the True Cost: The real cost of complexity is not lines of code but the mental effort required for a developer to understand and safely modify a module",
      "Strategic Decomposition: Splitting modules is not about making things smaller for its own sake, but about ensuring each piece can be understood independently",
      "Incremental Complexity Creep: Without budgets, each small addition seems harmless, but the cumulative effect makes modules incomprehensible over time"
    ],
    "core_concepts_zh": [
      "复杂度作为有限资源：将复杂度视为预算迫使团队做出有意识的分配决策，而非让复杂度无形积累",
      "认知负荷作为真实成本：复杂度的真实成本不是代码行数，而是开发者理解和安全修改模块所需的心智努力",
      "战略性分解：拆分模块不是为了缩小而缩小，而是为了确保每个部分都能被独立理解",
      "渐进式复杂度蔓延：没有预算时，每次小改动看似无害，但累积效应会使模块随时间变得难以理解"
    ],
    "timeline": [
      [
        "2018",
        "John Ousterhout publishes 'A Philosophy of Software Design', articulating complexity management as the central challenge of software engineering"
      ],
      [
        "2019",
        "Stanford CS 190 adopts the book as required reading, popularizing complexity-centric design thinking among new engineers"
      ],
      [
        "2021",
        "Google and Stripe engineering blogs cite complexity budgets in their module design guidelines for large-scale systems"
      ],
      [
        "2023",
        "AI-assisted code review tools begin incorporating complexity budget analysis to flag over-budget modules automatically"
      ]
    ],
    "timeline_zh": [
      [
        "2018",
        "John Ousterhout出版《软件设计哲学》，将复杂度管理阐述为软件工程的核心挑战"
      ],
      [
        "2019",
        "斯坦福CS 190将该书列为必读教材，在新工程师中普及以复杂度为中心的设计思维"
      ],
      [
        "2021",
        "Google和Stripe工程博客在大规模系统的模块设计指南中引用复杂度预算"
      ],
      [
        "2023",
        "AI辅助代码审查工具开始集成复杂度预算分析，自动标记超预算模块"
      ]
    ],
    "dos": [
      "Do set budgets relative to the module's role, because an orchestration layer legitimately needs more complexity than a utility function",
      "Do treat interface complexity and implementation complexity separately, because a simple interface can justifiably hide significant internal complexity",
      "Do revisit budgets when requirements change significantly, because new features may legitimately require reallocating complexity across modules",
      "Do use the budget as a conversation starter in code reviews, not a rigid gate, because judgment about when to decompose requires context"
    ],
    "dos_zh": [
      "根据模块角色设定相对预算，因为编排层合理需要比工具函数更多的复杂度",
      "分别对待接口复杂度和实现复杂度，因为简单接口可以合理地隐藏显著的内部复杂度",
      "在需求显著变化时重新审视预算，因为新功能可能合理地需要在模块间重新分配复杂度",
      "在代码审查中将预算用作对话起点而非刚性门控，因为何时分解的判断需要上下文"
    ],
    "donts": [
      "Don't set a single universal budget for all modules, because different roles in the architecture have legitimately different complexity needs",
      "Don't decompose modules purely based on size metrics, because a long but linear function may be less complex than a short function with deep nesting and hidden state",
      "Don't ignore the complexity cost of decomposition itself, because splitting a module into too many pieces creates coordination complexity that can exceed the original problem",
      "Don't let complexity budgets become a bureaucratic checkbox, because the goal is developer understanding, not metric compliance"
    ],
    "donts_zh": [
      "不要为所有模块设定单一的通用预算，因为架构中不同角色有合理的不同复杂度需求",
      "不要纯粹基于大小指标分解模块，因为一个长但线性的函数可能比一个带有深层嵌套和隐藏状态的短函数复杂度更低",
      "不要忽视分解本身的复杂度成本，因为将模块拆分为过多碎片会产生超过原始问题的协调复杂度",
      "不要让复杂度预算成为官僚化的检查框，因为目标是开发者的理解力，而非指标合规"
    ],
    "case_study_company": "Google",
    "case_study": "Google's internal code health guidelines explicitly track complexity per module as part of their readability review process. When the Google Maps rendering engine grew to over 50,000 lines in a single module, the team applied complexity budgeting to decompose it into 12 focused sub-modules (tile management, label placement, route rendering, etc.), each with a clear interface contract. Post-decomposition, the time to onboard new engineers to the rendering pipeline dropped from 3 months to 6 weeks, and the bug rate in rendering-related changes fell by 40% because developers could reason about each sub-module independently.",
    "case_study_zh": "Google的内部代码健康指南在其可读性审查流程中明确跟踪每个模块的复杂度。当Google Maps渲染引擎在单个模块中增长到超过50,000行时，团队应用复杂度预算将其分解为12个聚焦的子模块（瓦片管理、标签放置、路线渲染等），每个都有清晰的接口契约。分解后，新工程师熟悉渲染管线的时间从3个月缩短到6周，渲染相关变更的缺陷率下降了40%，因为开发者可以独立推理每个子模块。",
    "when_not_to_use": [
      "When the codebase is small enough that every developer can hold the entire system in their head, making formal budgets unnecessary overhead",
      "When you are in an early prototyping phase where rapid experimentation matters more than long-term modularity",
      "When the team is building a throwaway script or short-lived automation where maintenance costs will never materialize",
      "When decomposition would require breaking a performance-critical hot path into multiple modules, introducing unacceptable latency"
    ],
    "when_not_to_use_zh": [
      "当代码库足够小，每个开发者都能在脑中掌握整个系统，使正式预算成为不必要的开销时",
      "当处于早期原型阶段，快速实验比长期模块化更重要时",
      "当团队构建的是一次性脚本或短期自动化，维护成本永远不会产生时",
      "当分解需要将性能关键的热路径拆分为多个模块，引入不可接受的延迟时"
    ],
    "adopters": [
      "Google",
      "Stripe",
      "Stanford CS 190",
      "JetBrains",
      "Microsoft"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "John Ousterhout (2018). \"A Philosophy of Software Design\". Yaknyam Press.",
    "secondary_sources": [
      "Fred Brooks (1986). \"No Silver Bullet: Essence and Accidents of Software Engineering\". IEEE Computer, 19(4).",
      "Rich Hickey (2012). \"Simple Made Easy\". Strange Loop Conference, 2011."
    ],
    "typed_relations": [
      {
        "slug": "deep-vs-shallow-modules",
        "type": "complement"
      },
      {
        "slug": "separation-of-concerns",
        "type": "complement"
      },
      {
        "slug": "domain-driven-design",
        "type": "prerequisite"
      }
    ]
  },
  {
    "id": 162,
    "name": "Deep vs Shallow Modules",
    "name_zh": "深模块与浅模块",
    "slug": "deep-vs-shallow-modules",
    "category": "thinking",
    "desc": "Prefer deep modules (simple interface, complex implementation) over shallow ones (complex interface, simple implementation)",
    "desc_zh": "优先选择深模块（简单接口、复杂实现）而非浅模块（复杂接口、简单实现）",
    "steps": [
      "Identify Module Boundaries: map the system's current module structure and catalog each module's public interface (methods, parameters, return types, exceptions)",
      "Measure Depth Ratio: for each module, compare the interface complexity (number of public methods, parameters, configuration options) against the implementation complexity it hides",
      "Flag Shallow Modules: identify modules where the interface is nearly as complex as the implementation — these are shallow modules that push complexity onto their callers",
      "Redesign for Depth: merge shallow modules or expand their scope so that each module's simple interface hides significant implementation complexity, providing more value per abstraction",
      "Validate with Caller Simplicity: confirm that callers of the redesigned modules are simpler than before — if caller code hasn't gotten simpler, the deepening hasn't achieved its purpose"
    ],
    "steps_zh": [
      "识别模块边界：映射系统当前的模块结构，编目每个模块的公共接口（方法、参数、返回类型、异常）",
      "衡量深度比：为每个模块比较接口复杂度（公共方法数量、参数、配置选项）与其隐藏的实现复杂度",
      "标记浅模块：识别接口几乎与实现一样复杂的模块——这些浅模块将复杂度推给了调用者",
      "为深度重新设计：合并浅模块或扩展其范围，使每个模块的简单接口隐藏显著的实现复杂度，提供更高的抽象价值",
      "用调用者简化来验证：确认重新设计后模块的调用者比之前更简单——如果调用者代码没有变简单，深化就没有达到目的"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Deep Module",
      "Shallow Module",
      "Interface Width",
      "Implementation Depth"
    ],
    "viz_labels_zh": [
      "深模块",
      "浅模块",
      "接口宽度",
      "实现深度"
    ],
    "related": [
      "complexity-budget",
      "leaky-abstractions",
      "separation-of-concerns",
      "design-by-contract"
    ],
    "tags": [
      "abstraction",
      "interface-design",
      "modularity",
      "software-architecture",
      "information-hiding"
    ],
    "origin_author": "John Ousterhout, 2018",
    "origin_source": "A Philosophy of Software Design (John Ousterhout, 2018)",
    "origin_source_zh": "《软件设计哲学》（John Ousterhout，2018年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When your codebase has many small classes or functions that each do very little, forcing callers to orchestrate numerous tiny pieces",
      "When developers complain that using an internal API requires understanding too many parameters and configuration options",
      "When you notice that module interfaces mirror their implementations, providing no real abstraction benefit",
      "When refactoring a system and deciding how to draw module boundaries for maximum developer productivity"
    ],
    "when_to_use_zh": [
      "当代码库有许多小类或函数，每个做的事很少，迫使调用者编排大量微小组件时",
      "当开发者抱怨使用内部API需要理解太多参数和配置选项时",
      "当你注意到模块接口镜像其实现，没有提供真正的抽象收益时",
      "当重构系统并决定如何划分模块边界以最大化开发者生产力时"
    ],
    "core_concepts": [
      "Information Hiding: The primary purpose of a module is to hide implementation details behind a simple interface, reducing the knowledge callers need",
      "Interface-to-Implementation Ratio: A deep module provides a high ratio of hidden complexity to exposed interface, giving callers maximum benefit per abstraction",
      "Shallow Module Anti-Pattern: Modules that expose nearly as much complexity as they hide add abstraction layers without reducing overall system complexity",
      "Default-Rich Interfaces: Deep modules provide sensible defaults so that the common case requires minimal configuration, while still allowing advanced customization"
    ],
    "core_concepts_zh": [
      "信息隐藏：模块的首要目的是在简单接口背后隐藏实现细节，减少调用者所需的知识",
      "接口与实现比：深模块提供隐藏复杂度与暴露接口的高比率，为调用者带来最大的抽象收益",
      "浅模块反模式：暴露的复杂度与隐藏的几乎一样多的模块增加了抽象层但未减少整体系统复杂度",
      "默认值丰富的接口：深模块提供合理的默认值，使常见场景只需最小配置，同时仍允许高级定制"
    ],
    "timeline": [
      [
        "1972",
        "David Parnas publishes 'On the Criteria To Be Used in Decomposing Systems into Modules', establishing information hiding as the foundation of modular design"
      ],
      [
        "2018",
        "John Ousterhout publishes 'A Philosophy of Software Design', coining the deep vs shallow module distinction and making it a central design principle"
      ],
      [
        "2020",
        "The concept gains traction in the Go community, where the standard library is frequently cited as an exemplar of deep module design"
      ],
      [
        "2023",
        "AI code generation tools begin evaluating generated code for module depth, preferring to produce deep modules with minimal interface surface"
      ]
    ],
    "timeline_zh": [
      [
        "1972",
        "David Parnas发表《系统分解为模块的准则》，确立信息隐藏作为模块化设计的基础"
      ],
      [
        "2018",
        "John Ousterhout出版《软件设计哲学》，创造深模块与浅模块的区分，将其作为核心设计原则"
      ],
      [
        "2020",
        "该概念在Go社区获得广泛关注，Go标准库被频繁引用为深模块设计的典范"
      ],
      [
        "2023",
        "AI代码生成工具开始评估生成代码的模块深度，倾向于生成具有最小接口面的深模块"
      ]
    ],
    "dos": [
      "Do design interfaces around common use cases, providing sensible defaults that make the simple case trivial and the complex case possible",
      "Do measure module depth by asking 'how much does a caller need to know to use this correctly?' — less is deeper",
      "Do consider merging clusters of shallow modules into a single deep module when they are always used together",
      "Do look at the Unix file I/O API (open, read, write, close) as the canonical example of deep module design"
    ],
    "dos_zh": [
      "围绕常见用例设计接口，提供合理的默认值使简单场景极简而复杂场景可行",
      "通过问「调用者需要知道多少才能正确使用它？」来衡量模块深度——越少越深",
      "当一组浅模块总是一起使用时，考虑将它们合并为一个深模块",
      "以Unix文件I/O API（open、read、write、close）作为深模块设计的经典范例"
    ],
    "donts": [
      "Don't equate small classes with good design, because many tiny classes with pass-through methods create shallow module sprawl that increases total system complexity",
      "Don't create wrapper classes that merely forward calls to an underlying implementation, because they add an abstraction layer that hides nothing",
      "Don't expose implementation details through your interface (e.g., requiring callers to manage internal state), because that defeats the purpose of having a module boundary",
      "Don't over-parameterize interfaces to handle every possible case, because a 15-parameter constructor is a shallow interface that pushes complexity onto every caller"
    ],
    "donts_zh": [
      "不要将小类等同于好设计，因为许多只有透传方法的微小类造成浅模块蔓延，增加整体系统复杂度",
      "不要创建仅仅转发调用到底层实现的包装类，因为它们增加了不隐藏任何东西的抽象层",
      "不要通过接口暴露实现细节（如要求调用者管理内部状态），因为这违背了模块边界的目的",
      "不要为处理每种可能情况而过度参数化接口，因为15参数构造函数是将复杂度推给每个调用者的浅接口"
    ],
    "case_study_company": "Unix/POSIX",
    "case_study": "The Unix file system API is the textbook example of deep module design. The interface consists of just five calls — open(), read(), write(), close(), and lseek() — yet it hides enormous implementation complexity: file system types (ext4, NFS, ZFS), block allocation, caching, journaling, permissions, locking, and device driver interactions. A developer writing a program that reads a file needs to know almost nothing about these internals. This deep design has remained stable for over 50 years and has been implemented across hundreds of operating systems, proving that a well-designed deep interface can absorb decades of implementation evolution without changing its surface.",
    "case_study_zh": "Unix文件系统API是深模块设计的教科书范例。接口仅由五个调用组成——open()、read()、write()、close()和lseek()——却隐藏了巨大的实现复杂度：文件系统类型（ext4、NFS、ZFS）、块分配、缓存、日志、权限、锁定和设备驱动交互。编写读取文件的程序的开发者几乎不需要了解这些内部细节。这种深层设计已稳定了超过50年，并在数百个操作系统上实现，证明了设计良好的深接口可以吸收数十年的实现演进而不改变其表面。",
    "when_not_to_use": [
      "When building a thin adapter or integration layer whose sole purpose is to translate between two APIs, where the module is inherently shallow by design",
      "When performance requirements demand that callers have fine-grained control over implementation behavior, making a deep abstraction a bottleneck",
      "When the team is building a DSL or configuration system where expressiveness at the interface is the primary value proposition",
      "When prototyping rapidly and the overhead of designing deep interfaces would slow down exploration disproportionately"
    ],
    "when_not_to_use_zh": [
      "当构建薄适配器或集成层，其唯一目的是在两个API之间转换，模块设计上就是浅的时",
      "当性能要求调用者对实现行为有细粒度控制，使深层抽象成为瓶颈时",
      "当团队构建DSL或配置系统，接口的表达力是首要价值主张时",
      "当快速原型设计时，设计深接口的开销会不成比例地减慢探索速度时"
    ],
    "adopters": [
      "Unix/Linux Kernel",
      "Go Standard Library",
      "Java Collections Framework",
      "Redis",
      "SQLite"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "John Ousterhout (2018). \"A Philosophy of Software Design\". Yaknyam Press.",
    "secondary_sources": [
      "David Parnas (1972). \"On the Criteria to Be Used in Decomposing Systems into Modules\". Communications of the ACM, 15(12).",
      "Robert C. Martin (2017). \"Clean Architecture: A Craftsman's Guide to Software Structure and Design\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "complexity-budget",
        "type": "complement"
      },
      {
        "slug": "leaky-abstractions",
        "type": "related"
      },
      {
        "slug": "separation-of-concerns",
        "type": "related"
      },
      {
        "slug": "design-by-contract",
        "type": "related"
      }
    ]
  },
  {
    "id": 163,
    "name": "Bounded Rationality in Design",
    "name_zh": "设计中的有限理性",
    "slug": "bounded-rationality-in-design",
    "category": "thinking",
    "desc": "Designers satisfice rather than optimize; design for human cognitive limits rather than ideal rationality",
    "desc_zh": "设计者满意即可而非追求最优；为人类认知极限而非理想理性而设计",
    "steps": [
      "Map Cognitive Constraints: identify the cognitive limits users face in your context — working memory capacity (~4 chunks), attention span, decision fatigue, and expertise level",
      "Identify Satisficing Points: determine where users will 'satisfice' (choose the first good-enough option) rather than exhaustively evaluate all alternatives",
      "Reduce Choice Architecture Complexity: restructure choices to present fewer, more distinct options with clear differentiators, respecting the limits of comparative evaluation",
      "Design for Recognition over Recall: use visible cues, defaults, and progressive disclosure so users can recognize correct actions rather than having to remember them",
      "Test with Cognitive Load: validate designs under realistic cognitive load conditions (multitasking, time pressure, interruptions) rather than in ideal laboratory settings"
    ],
    "steps_zh": [
      "映射认知约束：识别用户在特定场景中面对的认知极限——工作记忆容量（约4个组块）、注意力持续时间、决策疲劳和专业水平",
      "识别满意点：确定用户会在哪里「满意即可」（选择第一个足够好的选项）而非穷举评估所有备选方案",
      "降低选择架构复杂度：重构选择以呈现更少、更有区分度的选项和清晰的差异化因素，尊重比较评估的极限",
      "为识别而非回忆而设计：使用可见线索、默认值和渐进式展示，使用户能识别正确操作而非需要记住它们",
      "在认知负荷下测试：在真实认知负荷条件（多任务、时间压力、中断）下验证设计，而非在理想实验室环境中"
    ],
    "ai_relevant": true,
    "viz_type": "pyramid",
    "viz_labels": [
      "Cognitive Limit",
      "Satisficing",
      "Heuristic",
      "Decision"
    ],
    "viz_labels_zh": [
      "认知局限",
      "满意决策",
      "启发规则",
      "决策结果"
    ],
    "related": [
      "design-thinking-ideo",
      "six-thinking-hats",
      "trade-off-sliders"
    ],
    "tags": [
      "cognitive-science",
      "decision-making",
      "user-experience",
      "satisficing",
      "choice-architecture"
    ],
    "origin_author": "Herbert A. Simon, 1947/1996",
    "origin_source": "Administrative Behavior (Herbert Simon, 1947; 4th ed. 1996) and The Sciences of the Artificial (1969)",
    "origin_source_zh": "《管理行为》（Herbert Simon，1947年；第四版1996年）和《人工科学》（1969年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When users consistently make suboptimal choices in your product because they are overwhelmed by options or information density",
      "When designing interfaces for high-stress or time-critical environments (medical, aviation, emergency response) where cognitive overload causes errors",
      "When A/B tests show that adding more features or options decreases rather than increases user satisfaction or task completion",
      "When designing AI-assisted decision tools where the system needs to present recommendations within human cognitive processing limits"
    ],
    "when_to_use_zh": [
      "当用户在产品中持续做出次优选择，因为被选项或信息密度所淹没时",
      "当为高压力或时间关键环境（医疗、航空、应急响应）设计界面，认知过载导致错误时",
      "当A/B测试显示添加更多功能或选项反而降低了用户满意度或任务完成率时",
      "当设计AI辅助决策工具，系统需要在人类认知处理极限内呈现建议时"
    ],
    "core_concepts": [
      "Satisficing: People choose the first option that meets their minimum threshold rather than evaluating all options to find the optimum — design should support this behavior, not fight it",
      "Cognitive Bandwidth: Working memory can hold roughly 4 items simultaneously; any interface that demands more simultaneous considerations will degrade decision quality",
      "Recognition over Recall: Humans are far better at recognizing correct options when presented than recalling them from memory — interfaces should show rather than demand",
      "Decision Fatigue: The quality of decisions degrades over a session of repeated choices; high-stakes decisions should be placed early and lower-stakes ones should use smart defaults"
    ],
    "core_concepts_zh": [
      "满意即可：人们选择第一个满足最低阈值的选项而非评估所有选项以找到最优——设计应支持这种行为而非对抗它",
      "认知带宽：工作记忆同时约能容纳4个条目；任何要求更多同时考虑的界面都会降低决策质量",
      "识别优于回忆：人类在呈现选项时识别正确选项的能力远强于从记忆中回忆——界面应展示而非要求",
      "决策疲劳：在重复选择的过程中决策质量会下降；高风险决策应放在前面，低风险决策应使用智能默认值"
    ],
    "timeline": [
      [
        "1947",
        "Herbert Simon introduces bounded rationality in 'Administrative Behavior', challenging the assumption of perfectly rational economic actors"
      ],
      [
        "1956",
        "Simon formalizes satisficing as a decision-making strategy in 'Rational Choice and the Structure of the Environment'"
      ],
      [
        "1978",
        "Simon wins the Nobel Prize in Economics for his pioneering research on decision-making processes within organizations"
      ],
      [
        "2013",
        "Don Norman integrates bounded rationality principles into 'The Design of Everyday Things' (revised edition), connecting cognitive limits to interaction design"
      ]
    ],
    "timeline_zh": [
      [
        "1947",
        "Herbert Simon在《管理行为》中引入有限理性概念，挑战了完全理性经济人的假设"
      ],
      [
        "1956",
        "Simon在《理性选择与环境结构》中将满意即可正式化为决策策略"
      ],
      [
        "1978",
        "Simon因其在组织内决策过程的开创性研究获得诺贝尔经济学奖"
      ],
      [
        "2013",
        "Don Norman在《设计心理学》（修订版）中整合有限理性原则，将认知极限与交互设计联系起来"
      ]
    ],
    "dos": [
      "Do design default options that work well for 80% of users, because most people will accept a good default rather than customizing",
      "Do limit choices to 3-5 options at each decision point, because more options increase decision time and decrease satisfaction (Hick's Law)",
      "Do use progressive disclosure to show advanced options only when needed, because revealing everything at once overwhelms bounded cognition",
      "Do test designs with tired, distracted, and novice users, because they reveal cognitive limit violations that expert testers miss"
    ],
    "dos_zh": [
      "设计对80%用户有效的默认选项，因为大多数人会接受好的默认值而非去定制",
      "将每个决策点的选择限制在3-5个选项，因为更多选项增加决策时间并降低满意度（希克定律）",
      "使用渐进式展示，仅在需要时显示高级选项，因为一次性展示所有内容会淹没有限认知",
      "用疲倦、分心和新手用户测试设计，因为他们能揭示专家测试者遗漏的认知极限违规"
    ],
    "donts": [
      "Don't assume users will read all information before making a decision, because bounded rationality means they will scan, satisfice, and move on",
      "Don't design for the power user by default, because the vast majority of users operate under more severe cognitive constraints than designers anticipate",
      "Don't offer unlimited customization as a substitute for good defaults, because choice overload causes decision paralysis rather than empowerment",
      "Don't rely on user training to overcome poor interface design, because cognitive limits are biological constraints that training cannot eliminate"
    ],
    "donts_zh": [
      "不要假设用户会在做决策前阅读所有信息，因为有限理性意味着他们会浏览、满意即可然后继续",
      "不要默认为高级用户设计，因为绝大多数用户在比设计师预期更严格的认知约束下操作",
      "不要提供无限定制作为良好默认值的替代品，因为选择过载导致决策瘫痪而非赋能",
      "不要依赖用户培训来克服糟糕的界面设计，因为认知极限是培训无法消除的生物约束"
    ],
    "case_study_company": "Amazon",
    "case_study": "Amazon's one-click purchasing is a textbook application of bounded rationality in design. By recognizing that the traditional multi-step checkout process demanded too many sequential decisions (shipping address, payment method, delivery speed, gift wrapping, confirmation), Amazon collapsed the entire flow into a single pre-configured action. This design respects the satisficing behavior of users who have already decided to buy — they want the first good-enough path to completion. The feature, patented in 1999, contributed to a measurable increase in conversion rates because it eliminated the decision fatigue that caused cart abandonment in multi-step flows.",
    "case_study_zh": "亚马逊的一键购买是有限理性在设计中的教科书应用。通过认识到传统多步结账流程要求太多连续决策（收货地址、支付方式、配送速度、礼品包装、确认），亚马逊将整个流程折叠为单一预配置操作。这种设计尊重已决定购买的用户的满意即可行为——他们想要第一条足够好的完成路径。该功能于1999年获得专利，促成了转化率的可测量提升，因为它消除了导致多步流程购物车放弃的决策疲劳。",
    "when_not_to_use": [
      "When the decision has irreversible high-stakes consequences (surgery, legal contracts) where satisficing could lead to catastrophic errors and deliberate analysis is warranted",
      "When users are domain experts who need full control and visibility, such as professional traders or system administrators",
      "When regulatory requirements mandate that all options be presented equally and prominently, preventing choice architecture optimization",
      "When the decision is truly novel and users have no prior reference point to satisfice against, requiring guided exploration instead"
    ],
    "when_not_to_use_zh": [
      "当决策具有不可逆的高风险后果（手术、法律合同），满意即可可能导致灾难性错误，需要审慎分析时",
      "当用户是需要完全控制和可见性的领域专家时，如专业交易员或系统管理员",
      "当法规要求所有选项必须被平等且突出地呈现，阻止选择架构优化时",
      "当决策真正是全新的，用户没有先前参考点来进行满意即可的判断，需要引导式探索时"
    ],
    "adopters": [
      "Amazon",
      "Apple",
      "Google (Material Design)",
      "Stripe (Checkout)",
      "Duolingo"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Herbert A. Simon (1947). \"Administrative Behavior: A Study of Decision-Making Processes in Administrative Organization\". Macmillan.",
    "secondary_sources": [
      "Herbert A. Simon (1969). \"The Sciences of the Artificial\". MIT Press.",
      "Herbert A. Simon (1996). \"The Sciences of the Artificial, 3rd Edition\". MIT Press."
    ],
    "typed_relations": [
      {
        "slug": "design-thinking-ideo",
        "type": "complement"
      },
      {
        "slug": "six-thinking-hats",
        "type": "related"
      },
      {
        "slug": "trade-off-sliders",
        "type": "related"
      }
    ]
  },
  {
    "id": 164,
    "name": "Separation of Concerns",
    "name_zh": "关注点分离",
    "slug": "separation-of-concerns",
    "category": "thinking",
    "desc": "The foundational principle of modular design: each module should address a single, well-defined concern",
    "desc_zh": "模块化设计的基础原则：每个模块应解决一个定义明确的关注点",
    "steps": [
      "Identify Concerns: enumerate all distinct responsibilities in the system — business logic, data access, presentation, authentication, logging, error handling, etc.",
      "Map Current Coupling: audit the existing codebase to find where multiple concerns are interleaved within single modules, classes, or functions",
      "Define Concern Boundaries: draw clear boundaries between concerns and decide which module owns each concern, creating a responsibility map",
      "Refactor to Isolate: extract interleaved concerns into dedicated modules with well-defined interfaces, using patterns like middleware, dependency injection, or layered architecture",
      "Enforce Boundaries: establish architectural rules (via linting, module dependency analysis, or code review checklists) that prevent cross-concern coupling from creeping back"
    ],
    "steps_zh": [
      "识别关注点：列出系统中所有不同的职责——业务逻辑、数据访问、展示、认证、日志记录、错误处理等",
      "映射当前耦合：审计现有代码库，找出多个关注点在单个模块、类或函数中交织的位置",
      "定义关注点边界：在关注点之间画出清晰的边界，决定哪个模块拥有每个关注点，创建职责映射",
      "重构以隔离：将交织的关注点提取到具有明确接口的专用模块中，使用中间件、依赖注入或分层架构等模式",
      "执行边界：建立架构规则（通过linting、模块依赖分析或代码审查清单），防止跨关注点耦合再次蔓延"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Concern",
      "Module",
      "Interface",
      "Dependency"
    ],
    "viz_labels_zh": [
      "关注点",
      "模块",
      "接口",
      "依赖关系"
    ],
    "related": [
      "deep-vs-shallow-modules",
      "complexity-budget",
      "domain-driven-design",
      "design-by-contract"
    ],
    "tags": [
      "modularity",
      "architecture",
      "software-design",
      "coupling",
      "cohesion"
    ],
    "origin_author": "Edsger W. Dijkstra, 1974",
    "origin_source": "On the role of scientific thought (Edsger W. Dijkstra, 1974, EWD447)",
    "origin_source_zh": "《论科学思想的角色》（Edsger W. Dijkstra，1974年，EWD447）",
    "complexity": "beginner",
    "when_to_use": [
      "When a single change to business logic requires modifications in database queries, UI templates, and API controllers simultaneously",
      "When different team members cannot work on different features without merge conflicts because concerns are tangled together in shared files",
      "When testing a module requires setting up unrelated infrastructure (e.g., testing business logic requires a running database) because concerns are not isolated",
      "When onboarding engineers struggle to find where a specific type of logic lives because it is scattered across multiple layers"
    ],
    "when_to_use_zh": [
      "当对业务逻辑的单一更改需要同时修改数据库查询、UI模板和API控制器时",
      "当不同团队成员无法在不同功能上工作而不产生合并冲突，因为关注点在共享文件中纠缠在一起时",
      "当测试一个模块需要设置不相关的基础设施（如测试业务逻辑需要运行数据库），因为关注点未被隔离时",
      "当入职工程师难以找到特定类型逻辑的位置，因为它分散在多个层中时"
    ],
    "core_concepts": [
      "Single Responsibility per Module: Each module should have one reason to change — when a concern changes, only the modules responsible for that concern should need modification",
      "Cohesion and Coupling: Separation of concerns maximizes cohesion (related code together) and minimizes coupling (unrelated code apart), the twin goals of good modular design",
      "Concern as an Axis of Change: A 'concern' is best identified by what changes together — if business rules and UI always change for different reasons, they are separate concerns",
      "Layered Independence: Properly separated concerns can evolve independently — you can swap a database layer without touching business logic, or redesign UI without altering domain models"
    ],
    "core_concepts_zh": [
      "每个模块单一职责：每个模块应只有一个变更原因——当一个关注点变化时，只有负责该关注点的模块需要修改",
      "内聚与耦合：关注点分离最大化内聚（相关代码在一起）并最小化耦合（不相关代码分开），这是良好模块化设计的双重目标",
      "关注点作为变化轴：「关注点」最好通过什么一起变化来识别——如果业务规则和UI总是因不同原因而变化，它们是不同的关注点",
      "分层独立性：适当分离的关注点可以独立演进——可以替换数据库层而不触及业务逻辑，或重新设计UI而不更改领域模型"
    ],
    "timeline": [
      [
        "1968",
        "Dijkstra's 'Go To Statement Considered Harmful' introduces structured programming, laying the groundwork for disciplined concern separation"
      ],
      [
        "1974",
        "Dijkstra coins the term 'separation of concerns' in EWD447, framing it as a technique for managing intellectual complexity"
      ],
      [
        "1996",
        "The Gang of Four's design patterns (Strategy, Observer, Decorator) provide reusable templates for separating concerns in object-oriented systems"
      ],
      [
        "2003",
        "Aspect-Oriented Programming (AspectJ) formalizes cross-cutting concern separation, addressing logging, security, and transactions that cut across traditional module boundaries"
      ]
    ],
    "timeline_zh": [
      [
        "1968",
        "Dijkstra的《Go To语句有害论》引入结构化编程，为规范的关注点分离奠定基础"
      ],
      [
        "1974",
        "Dijkstra在EWD447中创造「关注点分离」一词，将其定义为管理智识复杂度的技术"
      ],
      [
        "1996",
        "四人帮的设计模式（策略、观察者、装饰器）为面向对象系统中的关注点分离提供可复用模板"
      ],
      [
        "2003",
        "面向切面编程（AspectJ）将横切关注点分离正式化，解决跨越传统模块边界的日志、安全和事务问题"
      ]
    ],
    "dos": [
      "Do identify concerns by asking 'what changes together and why?' rather than by following rigid architectural templates, because real concerns are domain-specific",
      "Do use dependency injection to wire separated concerns together, because it keeps modules unaware of each other's implementations while allowing collaboration",
      "Do separate cross-cutting concerns (logging, auth, metrics) using middleware or aspect-oriented techniques rather than scattering them through business logic",
      "Do enforce separation through automated dependency analysis in CI, because manual discipline erodes over time as deadlines create pressure to take shortcuts"
    ],
    "dos_zh": [
      "通过问「什么一起变化以及为什么？」来识别关注点，而非遵循刚性架构模板，因为真正的关注点是领域特定的",
      "使用依赖注入将分离的关注点连接在一起，因为它让模块不知道彼此的实现同时允许协作",
      "使用中间件或面向切面技术分离横切关注点（日志、认证、指标），而非将它们散落在业务逻辑中",
      "通过CI中的自动依赖分析来执行分离，因为手动纪律会随着截止日期创造的走捷径压力而逐渐侵蚀"
    ],
    "donts": [
      "Don't separate concerns so aggressively that simple operations require tracing through 10 layers of indirection, because over-separation creates its own complexity",
      "Don't treat every function as a separate concern, because the goal is meaningful separation of distinct responsibilities, not maximum granularity",
      "Don't mix separation of concerns with code organization (file structure), because two concerns can live in the same file if they are logically distinct and small",
      "Don't ignore the cost of the abstractions needed to separate concerns, because interfaces, adapters, and dependency injection add cognitive overhead that must be justified"
    ],
    "donts_zh": [
      "不要过度激进地分离关注点以至于简单操作需要追踪10层间接调用，因为过度分离会产生自身的复杂度",
      "不要将每个函数都视为独立的关注点，因为目标是不同职责的有意义分离，而非最大粒度",
      "不要将关注点分离与代码组织（文件结构）混为一谈，因为如果两个关注点逻辑上不同且较小，它们可以在同一文件中",
      "不要忽视分离关注点所需抽象的成本，因为接口、适配器和依赖注入增加的认知开销必须被证明是合理的"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's migration from a monolithic Java application to a microservices architecture is a large-scale application of separation of concerns. The original monolith interleaved recommendation logic, streaming protocols, user authentication, billing, and content metadata in tightly coupled modules. By separating each into independent services with clear API boundaries, Netflix enabled independent deployment (over 1,000 production changes per day), independent scaling (the recommendation engine scales differently from the streaming CDN), and independent team ownership. The separation was so thorough that when the recommendation algorithm was completely rewritten in 2016, no other service required any changes.",
    "case_study_zh": "Netflix从单体Java应用到微服务架构的迁移是关注点分离的大规模应用。原始单体在紧耦合的模块中交织了推荐逻辑、流媒体协议、用户认证、计费和内容元数据。通过将每个关注点分离为具有清晰API边界的独立服务，Netflix实现了独立部署（每天超过1,000次生产变更）、独立扩展（推荐引擎与流媒体CDN不同地扩展）和独立的团队所有权。分离如此彻底，以至于当推荐算法在2016年被完全重写时，没有任何其他服务需要任何更改。",
    "when_not_to_use": [
      "When building a small script or utility where the overhead of modular separation exceeds the complexity of the entire program",
      "When concerns are genuinely inseparable in the domain (e.g., real-time physics simulation where rendering and physics must be tightly coupled for performance)",
      "When the team is in a rapid prototyping phase and premature separation would slow down learning — you can always separate later when boundaries become clear",
      "When the system has only one concern, such as a single-purpose data transformation pipeline, where separation would create artificial boundaries"
    ],
    "when_not_to_use_zh": [
      "当构建小型脚本或工具，模块化分离的开销超过整个程序的复杂度时",
      "当关注点在领域中确实不可分离时（如实时物理模拟中渲染和物理必须紧耦合以保证性能）",
      "当团队处于快速原型阶段，过早分离会减慢学习——可以在边界变得清晰后再分离",
      "当系统只有一个关注点时，如单一用途的数据转换管道，分离会创造人为的边界"
    ],
    "adopters": [
      "Netflix",
      "Linux Kernel",
      "React (UI/State separation)",
      "Spring Framework",
      "Django"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Edsger W. Dijkstra (1974). \"On the Role of Scientific Thought\". EWD447.",
    "secondary_sources": [
      "David Parnas (1972). \"On the Criteria to Be Used in Decomposing Systems into Modules\". Communications of the ACM, 15(12).",
      "Gregor Kiczales et al. (1997). \"Aspect-Oriented Programming\". ECOOP 1997, Springer."
    ],
    "typed_relations": [
      {
        "slug": "deep-vs-shallow-modules",
        "type": "complement"
      },
      {
        "slug": "complexity-budget",
        "type": "complement"
      },
      {
        "slug": "domain-driven-design",
        "type": "complement"
      },
      {
        "slug": "design-by-contract",
        "type": "complement"
      }
    ]
  },
  {
    "id": 165,
    "name": "Leaky Abstractions",
    "name_zh": "抽象泄漏",
    "slug": "leaky-abstractions",
    "category": "thinking",
    "desc": "All non-trivial abstractions leak; design systems to handle the inevitable failures of abstraction layers",
    "desc_zh": "所有非平凡抽象都会泄漏；设计系统时应考虑抽象层不可避免的失效",
    "steps": [
      "Identify Abstraction Layers: catalog all major abstractions in your system — ORMs, network protocols, file systems, cloud services, frameworks — and the implementation details they claim to hide",
      "Predict Leak Points: for each abstraction, identify the scenarios where the underlying reality will surface — network latency in RPC frameworks, SQL specifics in ORMs, file system semantics differences across OS platforms",
      "Design Escape Hatches: provide clean mechanisms for users to bypass the abstraction when it leaks, such as raw SQL access through an ORM or custom HTTP handling in a REST framework",
      "Document Known Leaks: explicitly document the cases where the abstraction is known to leak, so users encounter documentation rather than mysterious bugs",
      "Monitor for New Leaks: instrument abstraction boundaries to detect unexpected leak patterns in production (unusual error types, performance anomalies, fallback activations)"
    ],
    "steps_zh": [
      "识别抽象层：编目系统中所有主要抽象——ORM、网络协议、文件系统、云服务、框架——以及它们声称隐藏的实现细节",
      "预测泄漏点：为每个抽象识别底层现实会浮现的场景——RPC框架中的网络延迟、ORM中的SQL特性、跨操作系统平台的文件系统语义差异",
      "设计逃生口：提供清洁的机制让用户在抽象泄漏时绕过它，如通过ORM访问原始SQL或在REST框架中自定义HTTP处理",
      "记录已知泄漏：明确记录抽象已知泄漏的情况，使用户遇到文档而非神秘的错误",
      "监控新泄漏：在抽象边界设置检测，发现生产中意外的泄漏模式（异常错误类型、性能异常、降级激活）"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Abstraction Layer",
      "Leaked Detail",
      "Caller",
      "Implementation"
    ],
    "viz_labels_zh": [
      "抽象层",
      "泄漏细节",
      "调用方",
      "实现层"
    ],
    "related": [
      "deep-vs-shallow-modules",
      "separation-of-concerns",
      "complexity-budget",
      "design-by-contract"
    ],
    "tags": [
      "abstraction",
      "software-engineering",
      "debugging",
      "system-design",
      "failure-modes"
    ],
    "origin_author": "Joel Spolsky, 2002",
    "origin_source": "The Law of Leaky Abstractions (Joel Spolsky, Joel on Software blog, 2002)",
    "origin_source_zh": "《抽象泄漏法则》（Joel Spolsky，Joel on Software博客，2002年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When adopting a new abstraction layer (ORM, framework, cloud service) and you need to plan for the cases where it won't perfectly hide the underlying system",
      "When debugging production issues that trace back to abstraction layers behaving unexpectedly under edge conditions",
      "When designing an API or framework that will be used by developers who may not understand the underlying implementation",
      "When evaluating whether to build a custom abstraction vs using an existing one, weighing the leak surface of each option"
    ],
    "when_to_use_zh": [
      "当采用新的抽象层（ORM、框架、云服务），需要为其无法完美隐藏底层系统的情况做规划时",
      "当调试追溯到抽象层在边缘条件下行为异常的生产问题时",
      "当设计将被可能不理解底层实现的开发者使用的API或框架时",
      "当评估是构建自定义抽象还是使用现有抽象，权衡每个选项的泄漏面时"
    ],
    "core_concepts": [
      "Law of Leaky Abstractions: All non-trivial abstractions, to some degree, are leaky — the underlying complexity they hide will eventually surface in edge cases, performance characteristics, or error modes",
      "Abstraction Cost Awareness: Abstractions are not free; they trade one kind of complexity (implementation detail) for another (learning the abstraction's model and its failure modes)",
      "Escape Hatch Design: Good abstractions provide clean escape hatches so that when leaks occur, users can drop to a lower level without abandoning the entire abstraction",
      "Two-Level Understanding: Effective use of leaky abstractions requires understanding both the abstraction and the underlying system — you can work at the high level most of the time but must be prepared to reason at the low level"
    ],
    "core_concepts_zh": [
      "抽象泄漏法则：所有非平凡抽象在某种程度上都是泄漏的——它们隐藏的底层复杂度最终会在边缘情况、性能特征或错误模式中浮现",
      "抽象成本意识：抽象不是免费的；它们用一种复杂度（实现细节）换取另一种复杂度（学习抽象的模型及其失效模式）",
      "逃生口设计：好的抽象提供清洁的逃生口，使泄漏发生时用户可以降到更低层级而无需放弃整个抽象",
      "双层理解：有效使用泄漏抽象需要同时理解抽象和底层系统——大多数时候可以在高层工作，但必须准备好在低层推理"
    ],
    "timeline": [
      [
        "1992",
        "Gregor Kiczales introduces the concept of 'open implementation' at Xerox PARC, recognizing that some abstractions need to expose their internals selectively"
      ],
      [
        "2002",
        "Joel Spolsky publishes 'The Law of Leaky Abstractions' on Joel on Software, naming the phenomenon and providing memorable examples (TCP over IP, SQL over disk I/O)"
      ],
      [
        "2006",
        "The rise of ORMs (Hibernate, ActiveRecord) makes leaky abstractions a daily concern for web developers encountering the N+1 query problem and impedance mismatch"
      ],
      [
        "2020s",
        "Cloud abstractions (serverless, managed databases) create new categories of leaks — cold starts, throttling limits, and regional failover behaviors that the 'serverless' abstraction cannot fully hide"
      ]
    ],
    "timeline_zh": [
      [
        "1992",
        "Gregor Kiczales在Xerox PARC引入「开放实现」概念，认识到某些抽象需要选择性地暴露其内部"
      ],
      [
        "2002",
        "Joel Spolsky在Joel on Software上发表《抽象泄漏法则》，命名该现象并提供了令人印象深刻的示例（TCP over IP、SQL over 磁盘I/O）"
      ],
      [
        "2006",
        "ORM的兴起（Hibernate、ActiveRecord）使抽象泄漏成为Web开发者日常关注的问题，他们遇到了N+1查询问题和阻抗失配"
      ],
      [
        "2020年代",
        "云抽象（无服务器、托管数据库）创造了新的泄漏类别——冷启动、限流限制和区域故障转移行为，「无服务器」抽象无法完全隐藏"
      ]
    ],
    "dos": [
      "Do learn the layer below every abstraction you use, because when it leaks (and it will), you need mental models of both levels to debug effectively",
      "Do provide escape hatches in abstractions you build, because users will inevitably encounter cases your abstraction doesn't cover and they need a clean way out",
      "Do document known leak points explicitly in your API documentation, because 'here be dragons' warnings save users hours of confused debugging",
      "Do monitor abstraction boundaries in production for anomalous error rates, because new leak patterns often emerge under load or edge conditions you didn't test"
    ],
    "dos_zh": [
      "学习你使用的每个抽象下面的层，因为当它泄漏时（一定会的），你需要两个层级的心智模型来有效调试",
      "在你构建的抽象中提供逃生口，因为用户不可避免地会遇到你的抽象未覆盖的情况，他们需要一条干净的出路",
      "在API文档中明确记录已知泄漏点，因为「此处有龙」的警告能为用户节省数小时的困惑调试",
      "在生产中监控抽象边界的异常错误率，因为新的泄漏模式常在负载或你未测试的边缘条件下出现"
    ],
    "donts": [
      "Don't treat abstractions as airtight contracts, because the entire point of the law is that they will leak — defensive design anticipates this",
      "Don't add more abstraction layers to fix a leaky abstraction, because stacking leaky abstractions compounds the debugging difficulty exponentially",
      "Don't blame users for needing to understand the underlying system, because requiring two-level understanding is a consequence of abstraction physics, not user failure",
      "Don't build abstractions that hide failure modes, because the most dangerous leaks are the ones that silently corrupt data or degrade performance without visible errors"
    ],
    "donts_zh": [
      "不要将抽象视为密封的契约，因为这个法则的全部要点是它们会泄漏——防御性设计应预见到这一点",
      "不要用更多的抽象层来修复泄漏的抽象，因为堆叠泄漏的抽象会指数级地增加调试难度",
      "不要责怪用户需要理解底层系统，因为需要双层理解是抽象物理学的结果，而非用户的失败",
      "不要构建隐藏失效模式的抽象，因为最危险的泄漏是那些无声地损坏数据或降低性能而没有可见错误的泄漏"
    ],
    "case_study_company": "Heroku",
    "case_study": "Heroku's Platform-as-a-Service abstracted away server management, promising developers they could 'just push code.' But as applications scaled, the abstraction leaked in predictable ways: dyno cycling caused unpredictable request latency, the shared routing layer created noisy-neighbor performance issues, and the filesystem's ephemeral nature surprised developers who expected persistent local storage. Heroku responded well by documenting these leaks explicitly in their architecture documentation and providing escape hatches (dedicated dynos, external storage services, custom buildpacks). Companies that succeeded on Heroku were those whose engineers understood the underlying infrastructure model despite the abstraction, confirming Spolsky's law.",
    "case_study_zh": "Heroku的平台即服务抽象掉了服务器管理，向开发者承诺他们可以「只管推代码」。但随着应用规模扩大，抽象以可预测的方式泄漏：dyno循环导致不可预测的请求延迟，共享路由层造成噪声邻居性能问题，文件系统的临时性质让期望本地持久存储的开发者感到意外。Heroku通过在架构文档中明确记录这些泄漏并提供逃生口（专用dyno、外部存储服务、自定义构建包）做出了良好的回应。在Heroku上成功的公司都是其工程师尽管有抽象但仍理解底层基础设施模型的公司，这证实了Spolsky的法则。",
    "when_not_to_use": [
      "When working with trivial abstractions (simple wrapper functions, type aliases) where the abstraction is thin enough that leaks are inconsequential",
      "When the abstraction maps perfectly to the domain and there is no impedance mismatch (e.g., mathematical abstractions in numerical computing)",
      "When the team controls both sides of the abstraction and can modify it freely, reducing the cost of encountered leaks to near zero",
      "When the project is so short-lived that the probability of encountering an abstraction leak during its lifetime is negligible"
    ],
    "when_not_to_use_zh": [
      "当使用平凡抽象（简单包装函数、类型别名），抽象薄到泄漏无关紧要时",
      "当抽象完美映射到领域且没有阻抗失配时（如数值计算中的数学抽象）",
      "当团队控制抽象的两端并可以自由修改它，将遇到泄漏的成本降低到接近零时",
      "当项目生命周期很短，其存续期间遇到抽象泄漏的概率可以忽略不计时"
    ],
    "adopters": [
      "Heroku",
      "AWS (Lambda/Serverless)",
      "Microsoft (Entity Framework)",
      "Ruby on Rails (ActiveRecord)",
      "Docker"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Joel Spolsky (2002). \"The Law of Leaky Abstractions\". Joel on Software (joelonsoftware.com).",
    "secondary_sources": [
      "John Ousterhout (2018). \"A Philosophy of Software Design\". Yaknyam Press.",
      "David Parnas (1972). \"On the Criteria to Be Used in Decomposing Systems into Modules\". Communications of the ACM, 15(12)."
    ],
    "typed_relations": [
      {
        "slug": "deep-vs-shallow-modules",
        "type": "complement"
      },
      {
        "slug": "separation-of-concerns",
        "type": "prerequisite"
      },
      {
        "slug": "complexity-budget",
        "type": "complement"
      },
      {
        "slug": "design-by-contract",
        "type": "complement"
      }
    ]
  },
  {
    "id": 166,
    "name": "Worse is Better",
    "name_zh": "更差即更好",
    "slug": "worse-is-better",
    "category": "thinking",
    "desc": "Simpler, less correct implementations often win over complex, theoretically correct ones through easier adoption and faster evolution",
    "desc_zh": "更简单但不够正确的实现往往通过更容易的采纳和更快的演进击败复杂的理论正确实现",
    "steps": [
      "Define 'Good Enough': for each feature, explicitly define the minimum correctness and completeness threshold that delivers real user value, distinguishing essential correctness from theoretical perfection",
      "Prioritize Simplicity of Implementation: when facing a design choice between a complete-but-complex solution and a simple-but-limited one, default to simplicity unless the missing completeness causes real harm",
      "Ship and Observe: release the simpler version to real users quickly, using actual usage patterns to determine which missing capabilities genuinely matter vs which were hypothetical requirements",
      "Evolve Incrementally: add completeness and correctness incrementally based on observed need, letting real-world feedback guide investment rather than upfront architectural speculation",
      "Resist Second-System Syndrome: when the simple system succeeds and pressure mounts for a 'proper' redesign, evaluate whether the simple system's limitations actually cause problems or merely offend engineering aesthetics"
    ],
    "steps_zh": [
      "定义「足够好」：为每个功能明确定义提供真实用户价值的最低正确性和完整性阈值，区分本质正确性和理论完美",
      "优先考虑实现简洁性：当面对完整但复杂的方案和简单但有限的方案之间的设计选择时，默认选择简洁性，除非缺失的完整性造成实际伤害",
      "发布并观察：快速将更简单的版本发布给真实用户，用实际使用模式来确定哪些缺失能力真正重要，哪些是假设性需求",
      "渐进式演进：基于观察到的需求逐步增加完整性和正确性，让现实世界的反馈指导投入而非前期架构推测",
      "抵制第二系统综合征：当简单系统成功且「正式」重新设计的压力增大时，评估简单系统的局限性是否真正导致问题还是仅仅冒犯了工程美学"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Simplicity",
      "Correctness",
      "Interface",
      "Implementation"
    ],
    "viz_labels_zh": [
      "简洁性",
      "正确性",
      "接口设计",
      "实现设计"
    ],
    "related": [
      "first-principles-thinking",
      "trade-off-sliders",
      "complexity-budget"
    ],
    "tags": [
      "simplicity",
      "pragmatism",
      "software-philosophy",
      "adoption",
      "evolution"
    ],
    "origin_author": "Richard P. Gabriel, 1989",
    "origin_source": "Lisp: Good News, Bad News, How to Win Big (Richard P. Gabriel, 1989)",
    "origin_source_zh": "《Lisp：好消息、坏消息、如何大赢》（Richard P. Gabriel，1989年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When your team is debating whether to ship a pragmatic solution now or invest months in a theoretically superior architecture",
      "When a competitor is gaining market share with an inferior-but-available product while your team perfects a better design",
      "When designing a public API or protocol where simplicity of adoption matters more than covering every edge case in v1",
      "When evaluating open-source tools and the simpler, less feature-complete option has a larger community and faster iteration cycle"
    ],
    "when_to_use_zh": [
      "当团队在辩论是现在发布务实的方案还是投入数月在理论上更优的架构时",
      "当竞争对手用一个劣质但可用的产品在获取市场份额，而你的团队在完善更好的设计时",
      "当设计公共API或协议，采纳的简洁性比在v1中覆盖每个边缘情况更重要时",
      "当评估开源工具，更简单、功能不完整的选项拥有更大的社区和更快的迭代周期时"
    ],
    "core_concepts": [
      "Simplicity as Survival Advantage: Simpler systems are easier to port, easier to learn, and easier to adapt — this adaptability is a stronger evolutionary advantage than theoretical correctness",
      "The Viral Spread of Good-Enough: A system that is 50% as capable but available today will acquire users who then contribute improvements, eventually surpassing a system that is 100% capable but still in development",
      "Interface vs Implementation Simplicity: Gabriel distinguishes two strategies — 'the right thing' (simple interface, complex implementation) vs 'worse is better' (simple implementation, potentially rougher interface). The simpler implementation wins on adoption.",
      "Worse is Better as Market Dynamics: The principle is not about celebrating mediocrity but about recognizing that markets select for adoption speed and iteration velocity, not architectural purity"
    ],
    "core_concepts_zh": [
      "简洁性作为生存优势：更简单的系统更容易移植、学习和适应——这种适应性是比理论正确性更强的演化优势",
      "足够好的病毒式传播：一个能力50%但今天可用的系统会获得用户，用户随后贡献改进，最终超越一个能力100%但仍在开发中的系统",
      "接口简洁性与实现简洁性：Gabriel区分了两种策略——「正确的做法」（简单接口、复杂实现）与「更差即更好」（简单实现、可能粗糙的接口）。更简单的实现在采纳上胜出。",
      "更差即更好作为市场动态：这个原则不是庆祝平庸，而是认识到市场选择采纳速度和迭代速度，而非架构纯洁性"
    ],
    "timeline": [
      [
        "1989",
        "Richard Gabriel presents 'Lisp: Good News, Bad News, How to Win Big' at the European Conference on the Practical Application of Lisp, contrasting MIT/Stanford and New Jersey design philosophies"
      ],
      [
        "1991",
        "The essay circulates widely online, sparking decades of debate between 'the right thing' (Lisp/MIT approach) and 'worse is better' (Unix/C/New Jersey approach)"
      ],
      [
        "2000",
        "Gabriel revisits the concept in 'Worse Is Better Is Worse' and later 'Is Worse Really Better?', acknowledging the tension is unresolvable and context-dependent"
      ],
      [
        "2010s",
        "The lean startup movement and MVP methodology embody worse-is-better principles, validating the idea that shipping imperfect products early beats perfecting products in isolation"
      ]
    ],
    "timeline_zh": [
      [
        "1989",
        "Richard Gabriel在欧洲Lisp实际应用会议上发表《Lisp：好消息、坏消息、如何大赢》，对比MIT/斯坦福和新泽西设计哲学"
      ],
      [
        "1991",
        "该文在网上广泛传播，引发了「正确的做法」（Lisp/MIT方法）和「更差即更好」（Unix/C/新泽西方法）之间数十年的辩论"
      ],
      [
        "2000",
        "Gabriel在《更差即更好即更差》和后来的《更差真的更好吗？》中重新审视该概念，承认这种张力是不可解的且依赖上下文"
      ],
      [
        "2010年代",
        "精益创业运动和MVP方法论体现了更差即更好的原则，验证了尽早发布不完美产品胜过在隔离中完善产品的理念"
      ]
    ],
    "dos": [
      "Do explicitly define what 'good enough' means for each release, because without a threshold, worse-is-better becomes an excuse for carelessness",
      "Do ship the simplest version that delivers genuine value, then iterate based on real user feedback rather than hypothetical requirements",
      "Do preserve simplicity of the core implementation even as features are added, because the simplicity is what enabled the system's success in the first place",
      "Do study Unix, C, HTTP, and JSON as exemplars of worse-is-better systems that won through simplicity and adaptability"
    ],
    "dos_zh": [
      "为每个版本明确定义「足够好」的含义，因为没有阈值，更差即更好就变成了粗心的借口",
      "发布提供真实价值的最简版本，然后基于真实用户反馈而非假设性需求进行迭代",
      "即使添加功能也要保持核心实现的简洁性，因为简洁性是系统首先成功的原因",
      "研究Unix、C、HTTP和JSON作为通过简洁性和适应性获胜的更差即更好系统的范例"
    ],
    "donts": [
      "Don't use worse-is-better as justification for shipping broken or harmful software, because the principle is about simplicity trade-offs, not quality abandonment",
      "Don't apply worse-is-better to safety-critical systems (medical devices, aviation, financial infrastructure) where 'good enough' correctness can cause catastrophic harm",
      "Don't ignore technical debt that accumulates in worse-is-better systems, because the simplicity advantage erodes if the codebase becomes unmaintainable over time",
      "Don't conflate worse-is-better with anti-intellectualism, because the principle requires deep understanding of trade-offs to apply correctly"
    ],
    "donts_zh": [
      "不要用更差即更好来为发布有缺陷或有害的软件辩护，因为这个原则是关于简洁性权衡，而非质量放弃",
      "不要将更差即更好应用于安全关键系统（医疗设备、航空、金融基础设施），其中「足够好」的正确性可能导致灾难性伤害",
      "不要忽视在更差即更好系统中积累的技术债务，因为如果代码库随时间变得不可维护，简洁性优势就会消失",
      "不要将更差即更好与反智主义混为一谈，因为正确应用这个原则需要对权衡的深刻理解"
    ],
    "case_study_company": "Unix/C",
    "case_study": "Unix and C are Gabriel's original examples of worse-is-better triumphing. When compared to Multics and Lisp, Unix and C were objectively less capable: C had no garbage collection, no type safety, and manual memory management; Unix had a simpler but less orthogonal design than Multics. However, C was small enough to port to new hardware in weeks rather than years, and Unix was simple enough for a single person to understand the entire kernel. This portability and comprehensibility created a viral adoption loop: Unix spread to every university and hardware platform, accumulating contributions from thousands of developers. By the time 'better' systems like Multics were ready, Unix had already won through sheer ecosystem momentum, proving that implementation simplicity and rapid portability can outweigh theoretical superiority.",
    "case_study_zh": "Unix和C是Gabriel关于更差即更好获胜的原始范例。与Multics和Lisp相比，Unix和C在客观上能力更弱：C没有垃圾回收、没有类型安全、需要手动内存管理；Unix的设计比Multics更简单但正交性更低。然而，C小到可以在数周而非数年内移植到新硬件，Unix简单到单个人可以理解整个内核。这种可移植性和可理解性创造了病毒式采纳循环：Unix传播到每所大学和每个硬件平台，积累了数千名开发者的贡献。当Multics等「更好」的系统准备就绪时，Unix已经通过纯粹的生态系统动量获胜，证明了实现简洁性和快速可移植性可以胜过理论优越性。",
    "when_not_to_use": [
      "When building safety-critical systems where 'good enough' correctness has life-or-death consequences and regulatory compliance demands formal verification",
      "When the domain requires mathematical precision (cryptography, financial settlement, scientific computing) where an incorrect implementation is worse than no implementation",
      "When your users are technical experts who will reject a simplified system that doesn't meet their professional standards (compilers, databases, operating system kernels for specialized hardware)",
      "When you are building infrastructure that other systems will depend on for decades, where early simplicity trade-offs compound into systemic limitations"
    ],
    "when_not_to_use_zh": [
      "当构建安全关键系统，「足够好」的正确性关乎生死且法规合规要求形式化验证时",
      "当领域需要数学精度（密码学、金融清算、科学计算），不正确的实现比没有实现更糟时",
      "当用户是技术专家，他们会拒绝不满足其专业标准的简化系统（编译器、数据库、专用硬件的操作系统内核）时",
      "当你在构建其他系统将依赖数十年的基础设施，早期的简洁性权衡会复合为系统性局限时"
    ],
    "adopters": [
      "Unix/Linux",
      "C Language",
      "HTTP/HTML (early web)",
      "JSON",
      "Go Language"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "portability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Richard P. Gabriel (1989). \"Lisp: Good News, Bad News, How to Win Big\". AI Expert.",
    "secondary_sources": [
      "Richard P. Gabriel (1996). \"Patterns of Software: Tales from the Software Community\". Oxford University Press.",
      "Niklaus Wirth (1995). \"A Plea for Lean Software\". IEEE Computer, 28(2)."
    ],
    "typed_relations": [
      {
        "slug": "first-principles-thinking",
        "type": "complement"
      },
      {
        "slug": "trade-off-sliders",
        "type": "related"
      },
      {
        "slug": "complexity-budget",
        "type": "related"
      }
    ]
  },
  {
    "id": 253,
    "name": "Occam's Razor in Design",
    "name_zh": "奥卡姆剃刀原则（设计应用）",
    "slug": "occams-razor-in-design",
    "category": "thinking",
    "desc": "Among competing design solutions, prefer the simplest one that fully satisfies the requirements",
    "desc_zh": "在相互竞争的设计方案中，优先选择能完全满足需求的最简单方案",
    "steps": [
      "Enumerate all candidate solutions to the design problem without filtering, including both simple and complex options, to ensure the simplest solution is visible and not prematurely discarded",
      "Identify the minimum set of requirements that any solution must satisfy: functional requirements (what it must do), non-functional requirements (performance, reliability), and constraints (team skill, timeline)",
      "For each candidate solution, verify it satisfies all minimum requirements and identify any additional assumptions, dependencies, or entities it introduces beyond what the problem strictly requires",
      "Apply Occam's Razor: select the solution that satisfies all requirements while introducing the fewest additional entities, assumptions, and moving parts — not the most elegant solution, but the most parsimonious one",
      "Document which requirements the chosen solution satisfies and which optional complexity was deliberately omitted, so future teams understand what was simplified and can revisit when complexity becomes justified"
    ],
    "steps_zh": [
      "在不过滤的情况下列举设计问题的所有候选方案，包括简单和复杂的选项，确保最简单的方案可见且不被过早丢弃",
      "识别任何方案必须满足的最小需求集合：功能需求（必须做什么）、非功能需求（性能、可靠性）和约束条件（团队技能、时间线）",
      "对每个候选方案验证其是否满足所有最小需求，并识别其引入的超出问题严格所需的额外假设、依赖或实体",
      "应用奥卡姆剃刀：选择在满足所有需求的同时引入最少额外实体、假设和活动部件的方案——不是最优雅的方案，而是最简约的方案",
      "记录所选方案满足的需求以及被刻意省略的可选复杂性，使未来的团队了解什么被简化了，并在复杂性变得合理时可以重新审视"
    ],
    "ai_relevant": true,
    "viz_type": "pyramid",
    "viz_labels": [
      "Complex",
      "Simplified",
      "Minimal",
      "Essential"
    ],
    "viz_labels_zh": [
      "复杂方案",
      "简化方案",
      "精简方案",
      "最简本质"
    ],
    "related": [
      "worse-is-better",
      "complexity-budget",
      "first-principles-thinking",
      "trade-off-sliders"
    ],
    "tags": [
      "simplicity",
      "parsimony",
      "decision-making",
      "design-philosophy",
      "complexity"
    ],
    "origin_author": "William of Ockham (c. 1287-1347); design application by various authors",
    "origin_source": "Summa Logicae (William of Ockham, c. 1323); applied to software: 'The Art of Unix Programming' (Eric S. Raymond, Addison-Wesley, 2003)",
    "origin_source_zh": "《逻辑大全》（奥卡姆的威廉，约1323年）；软件领域应用：《Unix编程艺术》（Eric S. Raymond，Addison-Wesley，2003）",
    "complexity": "beginner",
    "when_to_use": [
      "When evaluating multiple architectural approaches that all satisfy requirements and you need a principled way to choose between them",
      "When a proposed solution introduces abstractions, patterns, or infrastructure that are not required by any current or near-term requirement",
      "When reviewing code or design proposals where added complexity is justified by theoretical future scenarios rather than demonstrated present need",
      "When debugging a system by choosing the simplest explanation for observed behavior before testing more complex hypotheses"
    ],
    "when_to_use_zh": [
      "当评估多个都满足需求的架构方案，需要有原则的方式在它们之间做出选择时",
      "当提议的方案引入了任何当前或近期需求都不需要的抽象、模式或基础设施时",
      "当审查代码或设计提案，其中增加的复杂性是由理论上的未来场景而非已证明的当前需求来证明的时",
      "当调试系统时，在测试更复杂的假设之前，为观察到的行为选择最简单的解释"
    ],
    "core_concepts": [
      "Parsimony principle: the razor does not mean 'always choose the simplest solution' but 'do not multiply entities beyond necessity' — a more complex solution is justified only when simpler alternatives genuinely fail to meet requirements",
      "Entity multiplication: in software design, 'entities' are abstractions, services, dependencies, configuration points, and moving parts; each additional entity adds cognitive overhead, failure modes, and maintenance cost",
      "Sufficiency threshold: Occam's Razor applies after a solution crosses the sufficiency threshold (satisfies all requirements); among sufficient solutions, prefer the simplest — it does not justify choosing an insufficient simple solution over a sufficient complex one",
      "Explanatory power: in debugging and root cause analysis, Occam's Razor favors the hypothesis that explains all observed symptoms with the fewest independent assumptions, guiding investigation toward the most likely root cause first"
    ],
    "core_concepts_zh": [
      "简约原则：奥卡姆剃刀并不意味着「总是选择最简单的方案」，而是「不要在不必要的情况下增加实体」——只有当更简单的替代方案确实无法满足需求时，更复杂的方案才是合理的",
      "实体乘增：在软件设计中，「实体」是抽象、服务、依赖、配置点和活动部件；每个额外实体都会增加认知开销、故障模式和维护成本",
      "充分性阈值：奥卡姆剃刀在方案越过充分性阈值（满足所有需求）后才适用；在充分的方案中，优先选择最简单的——它不能证明选择不充分的简单方案优于充分的复杂方案",
      "解释力：在调试和根因分析中，奥卡姆剃刀支持用最少独立假设解释所有观察症状的假设，引导调查首先朝向最可能的根本原因"
    ],
    "timeline": [
      [
        "c.1323",
        "William of Ockham articulates 'entia non sunt multiplicanda praeter necessitatem' (entities must not be multiplied beyond necessity) in Summa Logicae, formalizing the parsimony principle"
      ],
      [
        "1960s",
        "Software engineering pioneers including Dijkstra and Wirth apply parsimony to program design; 'structured programming' is partly an expression of Occam's Razor applied to control flow"
      ],
      [
        "2003",
        "Eric S. Raymond's 'The Art of Unix Programming' codifies the Rule of Parsimony as one of 17 Unix design rules, explicitly connecting Occam's Razor to software architecture decisions"
      ],
      [
        "2010s",
        "Lean startup and agile movements reinforce Occam's Razor through YAGNI (You Ain't Gonna Need It) and minimal viable product thinking, applying parsimony to feature decisions"
      ]
    ],
    "timeline_zh": [
      [
        "约1323",
        "奥卡姆的威廉在《逻辑大全》中阐述「实体不应超出必要而增加」，将简约原则系统化"
      ],
      [
        "1960年代",
        "包括Dijkstra和Wirth在内的软件工程先驱将简约原则应用于程序设计；「结构化编程」部分是奥卡姆剃刀应用于控制流的体现"
      ],
      [
        "2003",
        "Eric S. Raymond的《Unix编程艺术》将简约规则编入17条Unix设计原则之一，明确将奥卡姆剃刀与软件架构决策相连"
      ],
      [
        "2010年代",
        "精益创业和敏捷运动通过YAGNI（你不会需要它）和最小可行产品思维强化了奥卡姆剃刀，将简约原则应用于功能决策"
      ]
    ],
    "dos": [
      "Do require that any added complexity demonstrate a concrete, present requirement it satisfies, not a hypothetical future scenario, because most hypothetical requirements never materialize",
      "Do use the razor as a tiebreaker between solutions that all satisfy requirements, not as a way to justify choosing an insufficient solution because it is simpler",
      "Do apply the razor recursively: after choosing the simplest solution, look for simplifications within the chosen solution that preserve all requirements",
      "Do count dependencies, configuration parameters, and abstractions as entities that must be justified, not just classes and services"
    ],
    "dos_zh": [
      "要求任何增加的复杂性证明它满足的是具体的当前需求，而非假设性的未来场景，因为大多数假设性需求从未实现",
      "将奥卡姆剃刀用作满足需求的方案之间的决胜因素，而不是用来证明选择不充分方案因为它更简单的理由",
      "递归应用奥卡姆剃刀：在选择最简单的方案后，在所选方案内寻找保留所有需求的简化空间",
      "将依赖项、配置参数和抽象计为必须证明其合理性的实体，而不仅仅是类和服务"
    ],
    "donts": [
      "Don't conflate Occam's Razor with 'simpler is always better'; a simple solution that fails to meet a required non-functional property (latency, fault tolerance) is not parsimonious, it is incomplete",
      "Don't apply the razor to rule out legitimate complexity driven by genuine requirements such as security, compliance, or reliability constraints that the domain actually imposes",
      "Don't use the razor as a rhetorical device to dismiss architectural concerns without engaging with them; 'this is too complex' is not an application of Occam's Razor without a simpler sufficient alternative",
      "Don't apply the razor only at design time; revisit it during code review and refactoring to remove complexity that was once necessary but is no longer justified by current requirements"
    ],
    "donts_zh": [
      "不要将奥卡姆剃刀与「越简单越好」混为一谈；无法满足必要非功能属性（延迟、容错）的简单方案不是简约，而是不完整",
      "不要将奥卡姆剃刀用于排除由安全、合规或可靠性约束等领域实际施加的真实需求所驱动的合理复杂性",
      "不要将奥卡姆剃刀用作不经过充分讨论就驳回架构关切的修辞手段；「这太复杂了」在没有更简单的充分替代方案时，并不是奥卡姆剃刀的应用",
      "不要只在设计时应用奥卡姆剃刀；在代码审查和重构期间重新审视它，删除曾经必要但当前需求不再证明其合理性的复杂性"
    ],
    "case_study_company": "Amazon Web Services",
    "case_study": "Amazon's S3 API design is a canonical example of Occam's Razor applied to cloud service design. When designing S3 in 2006, Amazon chose the simplest possible object storage model: buckets containing flat key-value pairs with no directory hierarchy, no transactions, and eventually-consistent reads. Every feature request for POSIX filesystem semantics, atomic operations, or strong consistency was rejected as unnecessary complexity for the primary use case of durable object storage. This parsimonious design scaled to trillions of objects and enabled S3 to become the foundational storage primitive for the entire cloud industry.",
    "case_study_zh": "Amazon S3的API设计是奥卡姆剃刀应用于云服务设计的典范。2006年设计S3时，Amazon选择了尽可能简单的对象存储模型：包含扁平键值对的存储桶，没有目录层次结构、没有事务、最终一致性读取。每个要求POSIX文件系统语义、原子操作或强一致性的功能请求都被拒绝，认为对于持久对象存储这个主要用例来说是不必要的复杂性。这种简约的设计扩展到了数万亿个对象，并使S3成为整个云行业的基础存储原语。",
    "when_not_to_use": [
      "When 'simplicity' conflicts with safety in safety-critical domains (aviation, medical devices, financial clearing); correctness and fault tolerance requirements justify complexity that Occam's Razor would otherwise prune",
      "When simplicity is being used to avoid necessary complexity that the problem domain genuinely imposes (e.g., arguing that distributed systems should be 'simpler' when the requirements demand distribution)",
      "When the team is using the principle to resist learning or adopting established patterns that are more complex than familiar approaches but are well-justified",
      "When evaluating solutions at different abstraction levels; the razor applies within a given abstraction level, not across levels (a database is 'more complex' than a text file, but the complexity is justified by its functional capabilities)"
    ],
    "when_not_to_use_zh": [
      "当「简单」与安全关键领域（航空、医疗设备、金融清算）的安全性相冲突时；正确性和容错要求证明了奥卡姆剃刀原本会削减的复杂性的合理性",
      "当简单性被用于回避问题领域真正施加的必要复杂性时（例如：在需求要求分布式的情况下，主张分布式系统应该「更简单」）",
      "当团队使用该原则来抵制学习或采用比熟悉方法更复杂但有充分理由的既定模式时",
      "当在不同抽象层次上评估方案时；奥卡姆剃刀适用于给定抽象层次内，而非跨层次（数据库比文本文件「更复杂」，但其功能能力证明了这种复杂性的合理性）"
    ],
    "adopters": [
      "Amazon Web Services — S3's flat key-value model, SQS's at-least-once queue, and Lambda's stateless function model all embody parsimonious design",
      "Unix/Linux — each tool does one thing well; pipes compose simple tools into complex workflows rather than building monolithic complex tools",
      "JSON — replaced XML for most REST APIs by being a simpler, less expressive format that was sufficient for the majority of use cases",
      "SQLite — deliberately omits features like network access, user management, and stored procedures to remain the simplest embeddable relational database",
      "Go programming language — designed with explicit parsimony (no generics until 1.18, no exceptions, no inheritance) based on Google's experience that simplicity improves large-team maintainability"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Ockham, W. (c. 1323). Summa Logicae. Translated in: Spade, P. V. (1995). Five Texts on the Mediaeval Problem of Universals. Hackett.",
    "secondary_sources": [
      "Raymond, E. S. (2003). The Art of Unix Programming. Addison-Wesley. Chapter 1: The Rule of Parsimony.",
      "Gall, J. (1975). Systemantics: How Systems Work and Especially How They Fail. Quadrangle. (Source of 'A complex system that works is invariably found to have evolved from a simple system that worked.')",
      "Spolsky, J. (2002). 'The Law of Leaky Abstractions.' Joel on Software. joelonsoftware.com."
    ],
    "typed_relations": [
      {
        "slug": "worse-is-better",
        "type": "complement"
      },
      {
        "slug": "complexity-budget",
        "type": "complement"
      },
      {
        "slug": "first-principles-thinking",
        "type": "complement"
      }
    ]
  },
  {
    "id": 254,
    "name": "Architectural Kata",
    "name_zh": "架构卡塔",
    "slug": "architectural-kata",
    "category": "thinking",
    "desc": "Structured practice exercises where architects design systems for fictional scenarios to build architectural intuition and decision-making skills",
    "desc_zh": "架构师为虚构场景设计系统的结构化练习，用于培养架构直觉和决策能力",
    "steps": [
      "Select or create a kata scenario: a concise description of a fictional business problem with users, scale requirements, and constraints — similar to a product brief but designed to make architectural trade-offs explicit and interesting",
      "Form small groups (3-5 architects or engineers) and independently design the architecture for the given scenario within a time-boxed period (typically 45-90 minutes), documenting key decisions and their rationale",
      "Present the architectural design to peer groups or the broader audience, explicitly defending each major decision: which architectural characteristics were prioritized, which trade-offs were made, and what alternatives were considered",
      "Receive and give structured critique: reviewers challenge design decisions with 'what happens when X?' questions, and the presenting team must defend choices or acknowledge valid concerns they missed",
      "Conduct a debrief comparing different groups' architectures for the same scenario, discussing which approaches addressed requirements better and what the architectural lessons are that generalize beyond the specific kata"
    ],
    "steps_zh": [
      "选择或创建卡塔场景：对虚构业务问题的简明描述，包含用户、规模需求和约束条件——类似于产品简报，但旨在使架构权衡明确且有趣",
      "组成小组（3-5名架构师或工程师），在限定时间内（通常45-90分钟）独立为给定场景设计架构，记录关键决策及其依据",
      "向同组或更广泛的受众展示架构设计，明确捍卫每个主要决策：优先考虑了哪些架构特性、做出了哪些权衡、考虑了哪些替代方案",
      "接受和给予结构化的批评：审查者用「如果X会怎样？」的问题挑战设计决策，展示团队必须捍卫选择或承认他们遗漏的有效关切",
      "进行复盘，比较不同小组对同一场景的架构，讨论哪些方法更好地满足了需求，以及超越特定卡塔的可推广架构经验"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Problem",
      "Constraints",
      "Design",
      "Review",
      "Iterate"
    ],
    "viz_labels_zh": [
      "问题定义",
      "约束条件",
      "方案设计",
      "评审反馈",
      "迭代改进"
    ],
    "related": [
      "occams-razor-in-design",
      "theory-of-constraints",
      "first-principles-thinking",
      "trade-off-sliders"
    ],
    "tags": [
      "architecture",
      "practice",
      "learning",
      "deliberate-practice",
      "decision-making"
    ],
    "origin_author": "Ted Neward, 2009",
    "origin_source": "Ted Neward (2009). 'Architectural Katas.' tednew ard.com; Neal Ford, Rebecca Parsons & Patrick Kua (2020). 'Building Evolutionary Architectures.' O'Reilly.",
    "origin_source_zh": "Ted Neward（2009）《架构卡塔》tednew ard.com；Neal Ford、Rebecca Parsons & Patrick Kua（2020）《演进式架构》O'Reilly",
    "complexity": "intermediate",
    "when_to_use": [
      "When onboarding engineers to architecture responsibilities and you need a low-stakes environment to practice architectural decision-making",
      "When a team has been writing code for years but lacks experience making and defending system-level design decisions",
      "When preparing for architecture review boards, technical interviews, or staff+ engineering promotions that require demonstrating architectural thinking",
      "When a team is about to make a real high-stakes architectural decision and wants to warm up their thinking by practicing on a similar fictional scenario first"
    ],
    "when_to_use_zh": [
      "当将工程师引入架构职责时，需要一个低风险环境来练习架构决策",
      "当团队已编写多年代码但缺乏做出和捍卫系统级设计决策的经验时",
      "当准备架构评审委员会、技术面试或需要展示架构思维的高级工程师晋升时",
      "当团队即将做出真实的高风险架构决策，并希望通过先在类似虚构场景上练习来预热思维时"
    ],
    "core_concepts": [
      "Deliberate practice for architecture: architectural skills are developed through repeated cycles of design, presentation, and critique on varied problem domains — the kata format structures this deliberate practice cycle",
      "Architectural characteristics: the first step of any kata is identifying which '-ilities' (scalability, availability, security, maintainability, performance) are most important for the scenario, because these drive all subsequent trade-off decisions",
      "Trade-off articulation: the kata format forces architects to explicitly state trade-offs rather than design by instinct; a good kata presentation names what was sacrificed to achieve each architectural goal",
      "Safe failure environment: katas are fictional, so the cost of a wrong decision is learning, not a production incident; this psychological safety allows architects to take design risks they would avoid in high-stakes real contexts"
    ],
    "core_concepts_zh": [
      "架构的刻意练习：架构技能通过在不同问题领域反复设计、展示和批评的循环来培养——卡塔格式构建了这种刻意练习循环",
      "架构特性：任何卡塔的第一步都是识别哪些「-ility」（可扩展性、可用性、安全性、可维护性、性能）对场景最重要，因为这些驱动了所有后续的权衡决策",
      "权衡阐明：卡塔格式迫使架构师明确陈述权衡，而非凭直觉设计；好的卡塔展示会说明为了实现每个架构目标而牺牲了什么",
      "安全的失败环境：卡塔是虚构的，所以错误决策的代价是学习，而非生产事故；这种心理安全感使架构师能够在高风险真实环境中会回避的设计风险"
    ],
    "timeline": [
      [
        "2009",
        "Ted Neward introduces the Architectural Kata concept at the NFJS (No Fluff Just Stuff) symposium series, borrowing the martial arts kata metaphor to describe structured architecture practice"
      ],
      [
        "2012",
        "Neward publishes a curated list of kata scenarios online; the format is adopted by coding bootcamps, university programs, and corporate architecture guilds as a training tool"
      ],
      [
        "2017",
        "Neal Ford and Rebecca Parsons incorporate architectural katas into their O'Reilly 'Fundamentals of Software Architecture' course material, significantly broadening adoption"
      ],
      [
        "2020",
        "Remote-work acceleration drives virtual kata workshops via Miro and collaborative whiteboarding tools; kata formats expand to include AI system design and cloud-native architecture scenarios"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "Ted Neward在NFJS（No Fluff Just Stuff）研讨会上引入架构卡塔概念，借用武术套路的比喻来描述结构化架构练习"
      ],
      [
        "2012",
        "Neward在线发布精选卡塔场景列表；该格式被编程训练营、大学项目和企业架构协会采用作为培训工具"
      ],
      [
        "2017",
        "Neal Ford和Rebecca Parsons将架构卡塔纳入其O'Reilly《软件架构基础》课程材料，显著扩大了采用范围"
      ],
      [
        "2020",
        "远程工作加速推动了通过Miro和协作白板工具进行虚拟卡塔研讨；卡塔格式扩展到包括AI系统设计和云原生架构场景"
      ]
    ],
    "dos": [
      "Do time-box the design phase strictly (45-90 minutes) because open-ended design time leads to over-engineering; the kata's value comes from making decisions under realistic constraints",
      "Do require each group to explicitly list the architectural characteristics they prioritized before presenting the design, because prioritization is the highest-leverage architectural decision",
      "Do ask 'what would break this architecture?' after each presentation because stress-testing designs for failure modes is more valuable than critiquing aesthetic choices",
      "Do run the same kata scenario with multiple groups and compare approaches, because the diversity of valid solutions reveals the design space and demonstrates that architecture involves genuine trade-offs rather than single right answers"
    ],
    "dos_zh": [
      "严格限制设计阶段时间（45-90分钟），因为开放式设计时间会导致过度工程化；卡塔的价值来自在现实约束下做出决策",
      "要求每个小组在展示设计之前明确列出他们优先考虑的架构特性，因为优先级排序是最高杠杆的架构决策",
      "在每次展示后询问「什么会破坏这个架构？」，因为对故障模式进行压力测试比批评审美选择更有价值",
      "用多个小组运行相同的卡塔场景并比较方法，因为有效方案的多样性揭示了设计空间，并表明架构涉及真实的权衡而非单一正确答案"
    ],
    "donts": [
      "Don't critique kata designs for implementation details (code style, specific library choices) because the practice targets system-level decision-making, not implementation decisions",
      "Don't let groups spend more than 10 minutes on requirements clarification; ambiguity in kata scenarios is intentional because real architectural decisions are always made under incomplete information",
      "Don't declare one group's design the 'winner'; the point is to learn from the diversity of approaches and understand which trade-offs different designs make",
      "Don't use kata scenarios that are too narrowly scoped to a single domain or too abstractly philosophical; the best katas have 3-5 distinct architectural tensions that require explicit trade-off decisions"
    ],
    "donts_zh": [
      "不要批评卡塔设计的实现细节（代码风格、特定库选择），因为练习针对的是系统级决策，而非实现决策",
      "不要让小组在需求澄清上花费超过10分钟；卡塔场景中的模糊性是有意为之的，因为真实的架构决策总是在信息不完整的情况下做出的",
      "不要宣布某个小组的设计为「赢家」；重点是从多种方法的多样性中学习，了解不同设计做出了哪些权衡",
      "不要使用范围过于狭窄于单一领域或过于抽象哲学化的卡塔场景；最好的卡塔有3-5个需要明确权衡决策的不同架构张力"
    ],
    "case_study_company": "O'Reilly Media / ThoughtWorks",
    "case_study": "O'Reilly's online learning platform runs Architectural Kata workshops as part of their 'Fundamentals of Software Architecture' course led by Neal Ford and Mark Richards. In live course sessions, hundreds of engineers are divided into groups of 4-5, given a kata scenario such as 'design a nationwide road warrior trip tracking system for 2 million users', and have 45 minutes to design and present an architecture. The format has become one of the most popular exercises in the course, with learners reporting that defending their designs under peer questioning dramatically accelerates architectural intuition compared to reading about patterns alone.",
    "case_study_zh": "O'Reilly在线学习平台将架构卡塔研讨会作为Neal Ford和Mark Richards主讲的《软件架构基础》课程的一部分。在现场课程中，数百名工程师被分成4-5人小组，获得卡塔场景（如「为200万用户设计全国路战士行程追踪系统」），并有45分钟来设计和展示架构。这种格式已成为课程中最受欢迎的练习之一，学员反映，在同伴质疑下捍卫自己的设计比单独阅读模式更能显著加速架构直觉的培养。",
    "when_not_to_use": [
      "When the team is under immediate deadline pressure for a real system; kata exercises require dedicated uninterrupted time and a learning mindset, not a delivery mindset",
      "When participants have fewer than 2 years of software engineering experience; architectural katas require enough implementation experience to reason about trade-offs between real system properties",
      "When the organizational culture does not support open critique and intellectual disagreement; katas require psychological safety to challenge and be challenged without status games",
      "When you need to make a real architectural decision quickly; katas are deliberate practice, not a decision-making process for real systems"
    ],
    "when_not_to_use_zh": [
      "当团队面临真实系统的紧迫截止期限时；卡塔练习需要专门的不间断时间和学习心态，而非交付心态",
      "当参与者软件工程经验不足2年时；架构卡塔需要足够的实现经验来推理真实系统属性之间的权衡",
      "当组织文化不支持开放批评和知识分歧时；卡塔需要心理安全感来挑战和被挑战，没有地位游戏",
      "当需要快速做出真实架构决策时；卡塔是刻意练习，而非真实系统的决策流程"
    ],
    "adopters": [
      "O'Reilly Media — 'Fundamentals of Software Architecture' course by Neal Ford and Mark Richards uses katas as the primary hands-on learning mechanism",
      "ThoughtWorks — uses architectural katas in internal architecture community of practice workshops and new hire technical onboarding",
      "NFJS (No Fluff Just Stuff) — the conference series where Ted Neward originated the kata format; continues to feature kata workshops",
      "Corporate architecture guilds at Netflix, Spotify, and Zalando use kata formats for quarterly architecture practice sessions",
      "Coding bootcamps (General Assembly, Hack Reactor) adapted kata formats for senior cohorts to introduce system design thinking"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Neward, T. (2009). 'Architectural Katas.' tednew ard.com. (Original presentation at NFJS symposium.)",
    "secondary_sources": [
      "Ford, N. & Richards, M. (2020). Fundamentals of Software Architecture. O'Reilly Media. Chapter 19: Architecture Decisions.",
      "Ford, N., Parsons, R. & Kua, P. (2017). Building Evolutionary Architectures. O'Reilly Media.",
      "Rozanski, N. & Woods, E. (2011). Software Systems Architecture (2nd ed.). Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "first-principles-thinking",
        "type": "complement"
      },
      {
        "slug": "trade-off-sliders",
        "type": "complement"
      },
      {
        "slug": "occams-razor-in-design",
        "type": "related"
      },
      {
        "slug": "theory-of-constraints",
        "type": "related"
      }
    ]
  },
  {
    "id": 255,
    "name": "Theory of Constraints (TOC)",
    "name_zh": "约束理论（TOC）",
    "slug": "theory-of-constraints",
    "category": "thinking",
    "desc": "Identify and systematically exploit the binding constraint in a system to maximize throughput, then elevate or break it",
    "desc_zh": "识别并系统性地利用系统中的制约因素以最大化吞吐量，然后提升或打破它",
    "steps": [
      "Identify the constraint: find the single resource, process step, or policy that limits the system's throughput most — the constraint is the step where work piles up, has the longest queue, or determines the overall system capacity",
      "Exploit the constraint: maximize the output of the constraint using existing resources before spending on additional capacity — eliminate waste, reduce downtime, and ensure the constraint is never starved by upstream steps",
      "Subordinate everything else: adjust all non-constraint steps and policies to support the constraint's maximum output; if upstream processes are faster than the constraint, slow them to prevent WIP buildup that overwhelms the constraint",
      "Elevate the constraint: if exploiting and subordinating are insufficient, invest in additional capacity (people, machines, process redesign) specifically targeted at the constraint — not distributed across all steps",
      "Prevent inertia: after the constraint is broken and moves to a new location in the system, return to step 1 and identify the new constraint; do not let previous solutions become new constraints through outdated policies or local optimization"
    ],
    "steps_zh": [
      "识别约束：找到限制系统吞吐量最多的单一资源、流程步骤或政策——约束是工作积压、队列最长或决定整体系统产能的步骤",
      "充分利用约束：在增加额外产能之前，使用现有资源最大化约束的产出——消除浪费、减少停机时间，确保约束永远不会被上游步骤饿死",
      "将其他一切从属于约束：调整所有非约束步骤和政策以支持约束的最大产出；如果上游流程比约束快，就放慢它们以防止WIP积累压垮约束",
      "提升约束：如果充分利用和从属化仍然不足，专门针对约束投资额外产能（人员、机器、流程重设计）——而非分散在所有步骤",
      "防止惰性：约束被打破并转移到系统新位置后，返回步骤1识别新约束；不要因过时的政策或局部优化让以前的解决方案成为新约束"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Identify",
      "Exploit",
      "Subordinate",
      "Elevate",
      "Repeat"
    ],
    "viz_labels_zh": [
      "识别瓶颈",
      "充分利用",
      "从属调整",
      "提升瓶颈",
      "持续迭代"
    ],
    "related": [
      "architectural-kata",
      "trade-off-sliders"
    ],
    "tags": [
      "constraints",
      "throughput",
      "bottleneck",
      "systems-thinking",
      "goldratt"
    ],
    "origin_author": "Eliyahu M. Goldratt, 1984",
    "origin_source": "The Goal: A Process of Ongoing Improvement (Eliyahu M. Goldratt & Jeff Cox, North River Press, 1984); Theory of Constraints (Goldratt, 1990)",
    "origin_source_zh": "《目标：持续改善的过程》（Eliyahu M. Goldratt & Jeff Cox，North River Press，1984）；《约束理论》（Goldratt，1990）",
    "complexity": "intermediate",
    "when_to_use": [
      "When a system or workflow has a known bottleneck and you want a principled method to decide where to invest improvement effort rather than distributing it evenly across all steps",
      "When a software delivery process has a persistent wait state (code review backlog, deployment queue, QA handoff delay) that limits overall feature throughput",
      "When performance optimization efforts have been spread across many services and individual improvements have not produced system-level throughput gains",
      "When stakeholders are requesting parallel investments to improve multiple steps simultaneously and you need a framework to argue for focused investment on the constraint"
    ],
    "when_to_use_zh": [
      "当系统或工作流存在已知瓶颈，需要有原则的方法来决定在哪里投入改进精力，而非将其平均分配到所有步骤时",
      "当软件交付过程存在持续的等待状态（代码审查积压、部署队列、QA交接延迟），限制了整体功能吞吐量时",
      "当性能优化工作分散在许多服务中，单独的改进没有产生系统级吞吐量提升时",
      "当利益相关者要求并行投资同时改进多个步骤，需要一个框架来论证对约束进行集中投资时"
    ],
    "core_concepts": [
      "Throughput: the rate at which the system generates value (revenue, features shipped, requests served); TOC optimizes for system throughput, not local efficiency — improving a non-constraint step cannot increase system throughput if the constraint is unchanged",
      "Constraint (bottleneck): the single step that limits overall throughput; every system has exactly one binding constraint at any given time; all other steps have excess capacity relative to the constraint's output rate",
      "Inventory and WIP: work-in-progress that accumulates upstream of the constraint is 'inventory' in TOC; excess WIP adds cost, complexity, and lead time without increasing throughput",
      "The five focusing steps: Identify → Exploit → Subordinate → Elevate → Repeat; this cycle is continuous because breaking one constraint reveals the next one, and the system always has a constraint somewhere"
    ],
    "core_concepts_zh": [
      "吞吐量：系统产生价值的速率（收入、已交付功能、已服务请求）；TOC优化系统吞吐量而非局部效率——如果约束不变，改进非约束步骤无法提升系统吞吐量",
      "约束（瓶颈）：限制整体吞吐量的单一步骤；在任何给定时间，每个系统都恰好有一个绑定约束；相对于约束的产出率，所有其他步骤都有过剩产能",
      "库存和WIP：在约束上游积累的在制品在TOC中被称为「库存」；过多的WIP会增加成本、复杂性和前置时间，而不会增加吞吐量",
      "五个聚焦步骤：识别→利用→从属→提升→重复；这个循环是持续的，因为打破一个约束会揭示下一个约束，系统总是在某处存在约束"
    ],
    "timeline": [
      [
        "1984",
        "Eliyahu Goldratt and Jeff Cox publish 'The Goal' as a business novel set in a manufacturing plant, making TOC principles accessible to non-academic audiences; the book sells over 7 million copies"
      ],
      [
        "1990",
        "Goldratt publishes 'Theory of Constraints' as a formal management book, extending the five focusing steps from manufacturing to project management, supply chain, and distribution"
      ],
      [
        "1997",
        "Goldratt publishes 'Critical Chain', applying TOC to project management scheduling through buffer management and dependency-awareness, influencing agile and lean project methods"
      ],
      [
        "2004",
        "Mike Rother and John Shook's 'Learning to See' and Lean thinking integrate TOC bottleneck analysis with value stream mapping; TOC becomes foundational to DevOps and software delivery optimization"
      ]
    ],
    "timeline_zh": [
      [
        "1984",
        "Eliyahu Goldratt和Jeff Cox将《目标》作为以制造工厂为背景的商业小说出版，使TOC原则面向非学术受众；该书销量超过700万册"
      ],
      [
        "1990",
        "Goldratt出版《约束理论》作为正式管理书籍，将五个聚焦步骤从制造业扩展到项目管理、供应链和分销"
      ],
      [
        "1997",
        "Goldratt出版《关键链》，通过缓冲区管理和依赖关系意识将TOC应用于项目管理调度，影响了敏捷和精益项目方法"
      ],
      [
        "2004",
        "Mike Rother和John Shook的《精益思想》以及精益思维将TOC瓶颈分析与价值流图相结合；TOC成为DevOps和软件交付优化的基础"
      ]
    ],
    "dos": [
      "Do measure throughput at the system level (features delivered per sprint, deployments per day) before and after improvements, because local efficiency gains only matter if they improve system-level throughput",
      "Do protect the constraint from starvation by ensuring it always has work ready to process; idle constraint time is the most expensive form of waste in a TOC system",
      "Do reduce WIP limits upstream of the constraint using Kanban-style WIP caps, because accumulating work-in-progress upstream increases context switching, lead time, and coordination cost without increasing output",
      "Do use buffer management at the constraint: maintain a small upstream buffer to absorb variability in upstream step throughput, preventing constraint idle time from statistical randomness"
    ],
    "dos_zh": [
      "在改进前后在系统层面衡量吞吐量（每个冲刺交付的功能、每天的部署次数），因为局部效率提升只有在改善系统级吞吐量时才重要",
      "通过确保约束始终有待处理的工作来防止约束被饿死；约束空闲时间是TOC系统中最昂贵的浪费形式",
      "使用看板式WIP上限减少约束上游的WIP限制，因为在上游积累在制品会增加上下文切换、前置时间和协调成本，而不会增加产出",
      "在约束处使用缓冲区管理：维持小型上游缓冲区以吸收上游步骤吞吐量的变异性，防止约束因统计随机性而空闲"
    ],
    "donts": [
      "Don't optimize non-constraints independently of the constraint; improving the throughput of a non-constraint step cannot increase system throughput if the constraint remains unchanged, and may increase WIP and cost",
      "Don't add resources uniformly across all steps when capacity is needed; investment must be targeted specifically at the constraint to improve system throughput",
      "Don't treat the constraint as permanent; the five-step cycle must continue after breaking a constraint because the constraint moves to a new location and a new improvement cycle begins",
      "Don't measure success by local efficiency metrics (individual utilization rates, step-level cycle times) because 100% utilization of non-constraints is a sign of over-production, not efficiency"
    ],
    "donts_zh": [
      "不要独立于约束优化非约束步骤；如果约束保持不变，提高非约束步骤的吞吐量无法提升系统吞吐量，反而可能增加WIP和成本",
      "当需要产能时，不要在所有步骤上均匀增加资源；必须专门针对约束进行投资才能提升系统吞吐量",
      "不要将约束视为永久性的；打破约束后必须继续五步循环，因为约束会移动到新位置，新的改进周期开始",
      "不要用局部效率指标（个人利用率、步骤级周期时间）来衡量成功，因为非约束步骤100%的利用率是过度生产的迹象，而非效率"
    ],
    "case_study_company": "Gene Kim / Phoenix Project (DevOps)",
    "case_study": "Gene Kim, Kevin Behr, and George Spafford's 'The Phoenix Project' (2013) is explicitly modeled on Goldratt's 'The Goal', applying TOC to an IT organization's software delivery value stream. In the narrative, the constraint is identified as Brent, a single expert engineer who is the bottleneck for every critical deployment and incident resolution. The TOC solution is to identify Brent as the constraint, exploit him by documenting his knowledge (so others can absorb non-critical work), subordinate all other teams to protect Brent's time for constraint work, and elevate by cross-training. This framing directly influenced the DevOps movement's focus on deployment pipeline constraints and DORA metrics.",
    "case_study_zh": "Gene Kim、Kevin Behr和George Spafford的《凤凰项目》（2013）明确以Goldratt的《目标》为模型，将TOC应用于IT组织的软件交付价值流。在叙述中，约束被识别为Brent——一个在每次关键部署和事故解决中都是瓶颈的单一专家工程师。TOC解决方案是将Brent识别为约束，通过记录他的知识（使他人能够承担非关键工作）来充分利用他，将所有其他团队从属化以保护Brent的时间用于约束工作，并通过交叉培训来提升。这一框架直接影响了DevOps运动对部署管道约束和DORA指标的关注。",
    "when_not_to_use": [
      "When the system has no measurable throughput metric and optimization goals are ambiguous; TOC requires a clear definition of what throughput means for the system being optimized",
      "When the system is creative or knowledge work where the 'constraint' is not a process step but a quality of thinking that cannot be accelerated by adding more resources",
      "When the organization optimizes for resource utilization rather than throughput; TOC deliberately leaves non-constraints with idle capacity, which conflicts with utilization-focused management cultures",
      "When multiple simultaneous constraints exist (a system near capacity on several steps), because TOC's five-step cycle assumes a single binding constraint that can be isolated and improved"
    ],
    "when_not_to_use_zh": [
      "当系统没有可衡量的吞吐量指标且优化目标模糊时；TOC需要对被优化系统的吞吐量含义有清晰定义",
      "当系统是创意或知识工作，其中「约束」不是流程步骤而是无法通过增加资源来加速的思考质量时",
      "当组织优化资源利用率而非吞吐量时；TOC刻意让非约束步骤拥有空闲产能，这与以利用率为导向的管理文化相冲突",
      "当存在多个同时约束（系统在多个步骤上接近产能）时，因为TOC的五步循环假设存在可以被隔离和改进的单一绑定约束"
    ],
    "adopters": [
      "Intel — applied TOC to semiconductor manufacturing scheduling to reduce cycle times and increase fab throughput in the 1990s",
      "Boeing — used TOC in aircraft manufacturing assembly lines to identify and break constraints in component delivery sequencing",
      "Amazon — applies throughput optimization principles from TOC in their fulfillment center operations and software deployment pipelines",
      "DevOps community — 'The Phoenix Project' and 'The DevOps Handbook' apply TOC to software delivery, making constraint identification central to engineering team improvement",
      "Atlassian — Jira's WIP limits feature in Jira Software and Kanban boards are directly influenced by TOC's subordination principle"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "performance",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Goldratt, E. M. & Cox, J. (1984). The Goal: A Process of Ongoing Improvement. North River Press.",
    "secondary_sources": [
      "Goldratt, E. M. (1990). Theory of Constraints. North River Press.",
      "Kim, G., Behr, K. & Spafford, G. (2013). The Phoenix Project: A Novel About IT, DevOps, and Helping Your Business Win. IT Revolution Press.",
      "Rother, M. & Shook, J. (2003). Learning to See: Value Stream Mapping to Create Value and Eliminate Muda. Lean Enterprise Institute."
    ],
    "typed_relations": [
      {
        "slug": "trade-off-sliders",
        "type": "complement"
      },
      {
        "slug": "architectural-kata",
        "type": "related"
      }
    ]
  },
  {
    "id": 15,
    "name": "Architecture Decision Records (ADR)",
    "name_zh": "架构决策记录",
    "slug": "adr",
    "category": "architecture",
    "desc": "Lightweight docs capturing context and rationale for decisions",
    "desc_zh": "轻量级文档，记录架构决策的背景与理由",
    "steps": [
      "Identify the architectural decision that needs to be made and create a new ADR file with a sequential number and short title",
      "Document the context: describe the forces at play, constraints, and the problem being addressed",
      "List all options considered with their pros and cons, including the status (proposed, accepted, deprecated)",
      "Record the decision made, the rationale behind it, and which option was chosen",
      "Capture consequences: expected outcomes, trade-offs accepted, and follow-up actions or future ADRs triggered"
    ],
    "steps_zh": [
      "确定需要记录的架构决策，创建带有序号和简短标题的新ADR文件",
      "记录上下文：描述当前的约束条件、驱动力和需要解决的问题",
      "列出所有考虑过的方案及各自的优缺点，标注当前状态（建议中/已接受/已废弃）",
      "记录最终决策、决策理由以及选择该方案的依据",
      "记录后果：预期结果、已接受的权衡取舍，以及后续行动或触发的新ADR"
    ],
    "ai_relevant": true,
    "viz_type": "timeline",
    "viz_labels": [
      "Context",
      "Decision",
      "Consequences",
      "Status"
    ],
    "viz_labels_zh": [
      "背景",
      "决策",
      "结果影响",
      "状态"
    ],
    "related": [
      "continuous-architecture",
      "atam",
      "c4-model"
    ],
    "tags": [
      "documentation",
      "decisions",
      "rationale",
      "governance"
    ],
    "origin_author": "Michael Nygard, 2011",
    "origin_source": "Documenting Architecture Decisions (blog post on Cognitect)",
    "origin_source_zh": "《记录架构决策》（Cognitect博客文章）",
    "complexity": "beginner",
    "when_to_use": [
      "When a team needs to preserve the reasoning behind key architectural choices for future developers",
      "When multiple stakeholders disagree and a transparent decision log is needed",
      "When onboarding new team members who need to understand why the system is built a certain way",
      "When working in regulated industries that require auditable decision trails"
    ],
    "when_to_use_zh": [
      "当团队需要为未来开发者保留关键架构选择背后的理由时",
      "当多个利益相关者存在分歧，需要透明的决策日志时",
      "当新成员加入团队，需要了解系统为何如此构建时",
      "当在受监管行业工作，需要可审计的决策记录时"
    ],
    "core_concepts": [
      "Immutability: ADRs are append-only; decisions are never edited, only superseded by new ADRs",
      "Context capture: Recording the forces, constraints, and environment that shaped the decision",
      "Status lifecycle: Each ADR transitions through proposed, accepted, deprecated, or superseded states",
      "Consequence tracking: Explicitly documenting both positive outcomes and accepted trade-offs",
      "Lightweight format: Markdown files stored alongside code in version control for easy discoverability"
    ],
    "core_concepts_zh": [
      "不可变性：ADR采用追加模式，决策不会被修改，只会被新的ADR取代",
      "上下文捕获：记录影响决策的驱动力、约束条件和环境因素",
      "状态生命周期：每条ADR经历建议中、已接受、已废弃或已取代的状态流转",
      "后果追踪：明确记录正面结果和已接受的权衡取舍",
      "轻量格式：以Markdown文件存储在版本控制中，与代码并置以便于发现"
    ],
    "timeline": [
      [
        "2011",
        "Michael Nygard publishes the original blog post proposing ADRs"
      ],
      [
        "2016",
        "Nat Pryce creates the adr-tools command-line utility for managing ADRs"
      ],
      [
        "2017",
        "ThoughtWorks Technology Radar recommends ADRs as an 'Adopt' technique"
      ],
      [
        "2020",
        "ADRs become widely adopted in cloud-native and microservices communities"
      ],
      [
        "2022",
        "Log4brains and other ADR management tools emerge for team-scale usage"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Michael Nygard发表原始博客文章提出ADR概念"
      ],
      [
        "2016",
        "Nat Pryce创建adr-tools命令行工具用于管理ADR"
      ],
      [
        "2017",
        "ThoughtWorks技术雷达将ADR推荐为「采纳」级别技术"
      ],
      [
        "2020",
        "ADR在云原生和微服务社区中被广泛采用"
      ],
      [
        "2022",
        "Log4brains等ADR管理工具涌现，支持团队规模使用"
      ]
    ],
    "dos": [
      "Do number ADRs sequentially and give them short descriptive titles because it aids navigation and referencing",
      "Do store ADRs in the same repository as the code because it keeps decisions close to the artifacts they affect",
      "Do write ADRs at the time of the decision because context and rationale fade from memory quickly",
      "Do link related ADRs together because it creates a traceable decision graph"
    ],
    "dos_zh": [
      "按顺序编号并给ADR起简短的描述性标题，便于导航和引用",
      "将ADR存储在与代码相同的仓库中，让决策与其影响的产物保持紧密关联",
      "在做出决策时立即编写ADR，因为上下文和理由会很快被遗忘",
      "将相关的ADR相互链接，形成可追溯的决策图谱"
    ],
    "donts": [
      "Don't edit accepted ADRs retroactively because it destroys the historical record of what was actually decided",
      "Don't record every minor implementation detail because ADRs should focus on architecturally significant decisions",
      "Don't skip the context section because without it the decision loses its rationale over time",
      "Don't use ADRs as a replacement for discussion because they document outcomes, not the debate itself"
    ],
    "donts_zh": [
      "不要事后修改已接受的ADR，因为这会破坏实际决策的历史记录",
      "不要记录每个细小的实现细节，ADR应聚焦于架构层面的重要决策",
      "不要跳过上下文部分，缺少上下文会导致决策理由随时间流失",
      "不要用ADR替代讨论过程，ADR记录的是结论而非辩论本身"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify adopted ADRs across its autonomous squad model to solve the problem of architectural knowledge loss when engineers rotated between squads. By storing ADRs in each service repository, new squad members could quickly understand past decisions. This reduced onboarding time for new squad members by roughly 40% and prevented teams from revisiting decisions that had already been thoroughly evaluated.",
    "case_study_zh": "Spotify在其自治小队模型中全面采用ADR，解决工程师在小队间轮换时架构知识丢失的问题。通过在每个服务仓库中存储ADR，新小队成员能快速理解过去的决策。这使新成员的上手时间缩短约40%，并避免了团队重复讨论已经充分评估过的决策。",
    "when_not_to_use": [
      "Trivial decisions that don't affect system architecture or are easily reversible",
      "Solo projects where decision context exists only in one person's head and the project has no future maintainers",
      "Extremely fast-moving prototypes where the entire architecture may be discarded within weeks"
    ],
    "when_not_to_use_zh": [
      "不影响系统架构或容易撤销的琐碎决策",
      "决策上下文仅存在于一个人头脑中且项目没有未来维护者的个人项目",
      "整体架构可能在数周内被完全丢弃的快速原型项目"
    ],
    "adopters": [
      "Spotify",
      "GitHub",
      "GOV.UK",
      "Shopify",
      "ThoughtWorks"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Michael Nygard (2011). \"Documenting Architecture Decisions\". Cognitect Blog (cognitect.com).",
    "secondary_sources": [
      "Len Bass, Paul Clements, and Rick Kazman (2021). \"Software Architecture in Practice, 4th Edition\". Addison-Wesley.",
      "Joel Parker Henderson (2023). \"Architecture Decision Record (ADR)\". github.com/joelparkerhenderson/architecture-decision-record."
    ],
    "typed_relations": [
      {
        "slug": "continuous-architecture",
        "type": "complement"
      },
      {
        "slug": "atam",
        "type": "complement"
      },
      {
        "slug": "c4-model",
        "type": "complement"
      }
    ]
  },
  {
    "id": 16,
    "name": "Architecture Tradeoff Analysis Method (ATAM)",
    "name_zh": "架构权衡分析法",
    "slug": "atam",
    "category": "architecture",
    "desc": "Structured method to evaluate architecture against quality goals",
    "desc_zh": "系统评估架构对质量目标的结构化分析方法",
    "steps": [
      "Present the ATAM process and collect business drivers, key architectural goals, and stakeholder concerns",
      "Have the architect present the architecture using multiple views (module, runtime, deployment)",
      "Identify architectural approaches and map them to quality attribute utility tree scenarios",
      "Analyze each approach for sensitivity points, trade-off points, and risks vs. non-risks",
      "Prioritize scenarios and risks, produce a findings report with sensitivity and trade-off point summaries"
    ],
    "steps_zh": [
      "介绍ATAM流程，收集业务驱动因素、关键架构目标和利益相关者关切",
      "由架构师使用多个视图（模块、运行时、部署）呈现架构",
      "识别架构方法，将其映射到质量属性效用树场景",
      "分析每种方法的敏感点、权衡点以及风险与非风险项",
      "对场景和风险进行优先级排序，生成包含敏感点和权衡点摘要的分析报告"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "Performance",
      "Availability",
      "Security",
      "Modifiability",
      "Usability"
    ],
    "viz_labels_zh": [
      "性能",
      "可用性",
      "安全",
      "可修改性",
      "易用性"
    ],
    "related": [
      "qaw",
      "trade-off-sliders",
      "adr"
    ],
    "tags": [
      "evaluation",
      "trade-offs",
      "quality-attributes",
      "stakeholders"
    ],
    "origin_author": "Rick Kazman, Mark Klein, Paul Clements / SEI, 2000",
    "origin_source": "ATAM: Method for Architecture Evaluation (SEI Technical Report CMU/SEI-2000-TR-004)",
    "origin_source_zh": "《ATAM：架构评估方法》（SEI技术报告CMU/SEI-2000-TR-004）",
    "complexity": "advanced",
    "when_to_use": [
      "When evaluating a critical system architecture before committing to implementation",
      "When multiple quality attributes conflict and explicit trade-off analysis is needed",
      "When stakeholders need confidence that the architecture meets key business requirements",
      "When regulatory or safety-critical systems require documented architecture risk assessment"
    ],
    "when_to_use_zh": [
      "在进入实现阶段之前评估关键系统架构时",
      "当多个质量属性相互冲突，需要明确的权衡分析时",
      "当利益相关者需要确信架构满足关键业务需求时",
      "当法规或安全关键系统需要文档化的架构风险评估时"
    ],
    "core_concepts": [
      "Utility tree: Hierarchical decomposition of quality attributes into prioritized, testable scenarios",
      "Sensitivity point: An architectural parameter where a small change significantly affects a quality attribute",
      "Trade-off point: An architectural decision that affects two or more quality attributes in opposing directions",
      "Risk and non-risk: Categorization of architectural decisions based on their potential for negative outcomes",
      "Architectural approach: A specific design strategy or pattern used to address quality requirements"
    ],
    "core_concepts_zh": [
      "效用树：将质量属性层级分解为有优先级的可测试场景",
      "敏感点：一个微小变化就能显著影响质量属性的架构参数",
      "权衡点：同时以相反方向影响两个或多个质量属性的架构决策",
      "风险与非风险：根据产生负面结果的可能性对架构决策进行分类",
      "架构方法：用于满足质量需求的特定设计策略或模式"
    ],
    "timeline": [
      [
        "1998",
        "Initial ATAM concepts developed at the Software Engineering Institute (SEI)"
      ],
      [
        "2000",
        "SEI publishes the formal ATAM technical report"
      ],
      [
        "2003",
        "Kazman, Bass, and Clements publish 'Software Architecture in Practice' covering ATAM in depth"
      ],
      [
        "2005",
        "ATAM becomes an industry standard for architecture evaluation in defense and aerospace"
      ],
      [
        "2012",
        "Lightweight variants of ATAM emerge for agile teams needing faster evaluation cycles"
      ]
    ],
    "timeline_zh": [
      [
        "1998",
        "软件工程研究所（SEI）开发ATAM的初始概念"
      ],
      [
        "2000",
        "SEI发布ATAM正式技术报告"
      ],
      [
        "2003",
        "Kazman、Bass和Clements出版《软件架构实践》深入介绍ATAM"
      ],
      [
        "2005",
        "ATAM成为国防和航空航天领域架构评估的行业标准"
      ],
      [
        "2012",
        "针对敏捷团队需要更快评估周期的轻量级ATAM变体出现"
      ]
    ],
    "dos": [
      "Do involve a broad set of stakeholders because hidden quality concerns surface only when diverse perspectives are present",
      "Do build the utility tree collaboratively because stakeholder buy-in on scenario priorities is critical",
      "Do schedule ATAM early in the lifecycle because it is most cost-effective before major implementation decisions are locked in",
      "Do document all identified risks even if they seem unlikely because they form a valuable risk registry"
    ],
    "dos_zh": [
      "邀请广泛的利益相关者参与，因为隐藏的质量关切只有在多元视角汇聚时才会浮现",
      "协作构建效用树，因为利益相关者对场景优先级的认同至关重要",
      "在生命周期早期安排ATAM，因为在主要实现决策锁定之前最具成本效益",
      "记录所有已识别的风险，即使看似不太可能发生，因为它们构成了有价值的风险登记册"
    ],
    "donts": [
      "Don't skip stakeholder scenario brainstorming because the evaluation team cannot anticipate all quality concerns alone",
      "Don't treat ATAM as a pass/fail audit because its purpose is to surface risks and trade-offs, not to judge",
      "Don't run ATAM on an architecture that hasn't been documented because the method requires concrete views to analyze",
      "Don't rush the utility tree prioritization because incorrect priorities lead to analyzing the wrong scenarios"
    ],
    "donts_zh": [
      "不要跳过利益相关者的场景头脑风暴，因为评估团队无法独自预见所有质量关切",
      "不要将ATAM视为通过/不通过的审计，其目的是揭示风险和权衡而非评判",
      "不要对尚未文档化的架构运行ATAM，因为该方法需要具体的视图来分析",
      "不要仓促进行效用树优先级排序，因为错误的优先级会导致分析错误的场景"
    ],
    "case_study_company": "NASA Jet Propulsion Laboratory",
    "case_study": "NASA JPL used ATAM to evaluate the architecture of the Mars Rover ground control system before the Curiosity mission. The evaluation identified critical trade-offs between real-time telemetry processing performance and fault tolerance. By surfacing these risks early, JPL redesigned the data pipeline to support graceful degradation, which proved essential when communication delays exceeded expected parameters during landing.",
    "case_study_zh": "NASA喷气推进实验室在好奇号火星车任务之前使用ATAM评估地面控制系统架构。评估识别出实时遥测处理性能与容错性之间的关键权衡。通过及早发现这些风险，JPL重新设计了数据管道以支持优雅降级，这在着陆期间通信延迟超出预期参数时证明至关重要。",
    "when_not_to_use": [
      "Small projects where the cost of a full ATAM evaluation exceeds the project budget",
      "Early-stage startups where requirements change too rapidly for formal evaluation to remain relevant",
      "Systems with a single dominant quality attribute where trade-off analysis adds little value"
    ],
    "when_not_to_use_zh": [
      "小型项目中完整ATAM评估的成本超过项目预算时",
      "需求变化过快、正式评估难以保持相关性的早期创业公司",
      "只有单一主导质量属性、权衡分析附加价值有限的系统"
    ],
    "adopters": [
      "NASA",
      "Lockheed Martin",
      "Philips Healthcare",
      "Raytheon",
      "Boeing"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Rick Kazman, Mark Klein, and Paul Clements (2000). \"ATAM: Method for Architecture Evaluation\". SEI Technical Report CMU/SEI-2000-TR-004.",
    "secondary_sources": [
      "Len Bass, Paul Clements, and Rick Kazman (2003). \"Software Architecture in Practice, 2nd Edition\". Addison-Wesley.",
      "Rick Kazman et al. (1998). \"The Architecture Tradeoff Analysis Method\". IEEE ICECCS 1998."
    ],
    "typed_relations": [
      {
        "slug": "qaw",
        "type": "prerequisite"
      },
      {
        "slug": "trade-off-sliders",
        "type": "complement"
      },
      {
        "slug": "adr",
        "type": "complement"
      }
    ]
  },
  {
    "id": 17,
    "name": "CAP Theorem",
    "name_zh": "CAP定理",
    "slug": "cap-theorem",
    "category": "architecture",
    "desc": "Distributed systems can guarantee only 2 of 3: C, A, P",
    "desc_zh": "分布式系统只能同时保证一致性、可用性、分区容错性中的两个",
    "steps": [
      "Identify the distributed system's core requirements: what data consistency guarantees are needed by clients",
      "Determine realistic network partition scenarios and their likelihood in your deployment environment",
      "Choose the CP or AP trade-off: decide whether consistency or availability is sacrificed during partition events",
      "Select data stores, protocols, and replication strategies that implement the chosen trade-off (e.g., etcd for CP, Cassandra for AP)",
      "Design compensating mechanisms: conflict resolution, eventual consistency patterns, or client retry logic to handle the chosen trade-off"
    ],
    "steps_zh": [
      "明确分布式系统的核心需求：客户端需要什么级别的数据一致性保证",
      "评估网络分区的实际场景及其在当前部署环境中的发生概率",
      "选择CP或AP权衡：决定在分区事件发生时牺牲一致性还是可用性",
      "选择实现该权衡的数据存储、协议和复制策略（如CP选etcd，AP选Cassandra）",
      "设计补偿机制：冲突解决、最终一致性模式或客户端重试逻辑来应对所选权衡"
    ],
    "ai_relevant": false,
    "viz_type": "venn",
    "viz_labels": [
      "Consistency",
      "Availability",
      "Partition Tolerance"
    ],
    "viz_labels_zh": [
      "一致性",
      "可用性",
      "分区容错"
    ],
    "related": [
      "cqrs-pattern",
      "saga-pattern",
      "eda"
    ],
    "tags": [
      "distributed-systems",
      "consistency",
      "availability",
      "partition-tolerance"
    ],
    "origin_author": "Eric Brewer, 2000",
    "origin_source": "Towards Robust Distributed Systems (keynote at ACM PODC 2000); formally proved by Gilbert and Lynch, 2002",
    "origin_source_zh": "《迈向健壮的分布式系统》（2000年ACM PODC主题演讲）；由Gilbert和Lynch于2002年正式证明",
    "complexity": "intermediate",
    "when_to_use": [
      "When designing a distributed database or data store and choosing replication strategy",
      "When evaluating the behavior of a system during network failures or partitions",
      "When deciding between strong consistency and high availability for a distributed service",
      "When selecting between CP systems like ZooKeeper and AP systems like Cassandra"
    ],
    "when_to_use_zh": [
      "设计分布式数据库或数据存储并选择复制策略时",
      "评估系统在网络故障或分区期间的行为时",
      "在分布式服务中决定强一致性与高可用性之间的取舍时",
      "在ZooKeeper等CP系统和Cassandra等AP系统之间做选择时"
    ],
    "core_concepts": [
      "Consistency: Every read receives the most recent write or an error across all nodes",
      "Availability: Every request receives a non-error response, without guarantee it contains the most recent write",
      "Partition tolerance: The system continues operating despite arbitrary message loss or delay between nodes",
      "PACELC extension: When partitioned choose A or C; Else (no partition) choose between Latency and Consistency",
      "Eventual consistency: A relaxed consistency model where all replicas converge to the same state over time"
    ],
    "core_concepts_zh": [
      "一致性：每次读取都能获得所有节点上最近的写入结果或返回错误",
      "可用性：每个请求都能收到非错误响应，但不保证包含最新写入",
      "分区容错性：系统在节点间任意消息丢失或延迟的情况下继续运行",
      "PACELC扩展：分区时选择A或C；无分区时在延迟和一致性之间选择",
      "最终一致性：一种宽松的一致性模型，所有副本随时间收敛到相同状态"
    ],
    "timeline": [
      [
        "2000",
        "Eric Brewer presents the CAP conjecture at ACM PODC symposium"
      ],
      [
        "2002",
        "Seth Gilbert and Nancy Lynch formally prove the CAP theorem at MIT"
      ],
      [
        "2007",
        "Amazon publishes the Dynamo paper, demonstrating AP design at scale"
      ],
      [
        "2012",
        "Eric Brewer publishes 'CAP Twelve Years Later' clarifying common misconceptions"
      ],
      [
        "2015",
        "Martin Kleppmann critiques CAP and proposes more nuanced consistency models"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "Eric Brewer在ACM PODC研讨会上提出CAP猜想"
      ],
      [
        "2002",
        "Seth Gilbert和Nancy Lynch在MIT正式证明CAP定理"
      ],
      [
        "2007",
        "Amazon发布Dynamo论文，展示大规模AP设计"
      ],
      [
        "2012",
        "Eric Brewer发表《CAP定理十二年后》澄清常见误解"
      ],
      [
        "2015",
        "Martin Kleppmann对CAP提出批评，提出更细致的一致性模型"
      ]
    ],
    "dos": [
      "Do treat CAP as a spectrum rather than a binary choice because real systems can tune consistency levels per operation",
      "Do consider the PACELC extension because it covers the common case where no partition exists",
      "Do design for partition tolerance first because network partitions are inevitable in distributed systems",
      "Do use conflict resolution strategies like vector clocks or CRDTs when choosing AP designs"
    ],
    "dos_zh": [
      "将CAP视为一个频谱而非二元选择，因为真实系统可以按操作调整一致性级别",
      "考虑PACELC扩展，因为它涵盖了无分区存在的常见情况",
      "优先为分区容错性设计，因为网络分区在分布式系统中不可避免",
      "选择AP设计时使用向量时钟或CRDT等冲突解决策略"
    ],
    "donts": [
      "Don't interpret CAP as 'pick exactly two' because partition tolerance is not optional in real distributed systems",
      "Don't assume CAP applies to single-node systems because it only describes distributed system behavior during partitions",
      "Don't ignore the latency dimension because CAP doesn't address the consistency-latency trade-off in normal operations",
      "Don't apply CAP to classify systems statically because the same system may behave as CP or AP depending on configuration"
    ],
    "donts_zh": [
      "不要将CAP理解为「三选二」，因为分区容错性在真实分布式系统中不可选择放弃",
      "不要假设CAP适用于单节点系统，它仅描述分布式系统在分区期间的行为",
      "不要忽略延迟维度，因为CAP未涉及正常运行中一致性与延迟的权衡",
      "不要静态地用CAP对系统进行分类，因为同一系统可能根据配置表现为CP或AP"
    ],
    "case_study_company": "Amazon DynamoDB",
    "case_study": "Amazon designed DynamoDB as an AP system to ensure the shopping cart is always available, even during network partitions. During partition events, DynamoDB accepts conflicting writes and resolves them later using vector clocks and application-level reconciliation. This design choice directly supported Amazon's business requirement that a customer should never see an error when adding items to their cart, even if it means showing slightly stale data.",
    "case_study_zh": "Amazon将DynamoDB设计为AP系统，确保购物车在网络分区期间始终可用。在分区事件中，DynamoDB接受冲突写入，随后使用向量时钟和应用层协调来解决冲突。这一设计选择直接支持了Amazon的业务需求：客户在向购物车添加商品时永远不应看到错误，即使这意味着显示略微过时的数据。",
    "when_not_to_use": [
      "Single-node or non-distributed systems where partition tolerance is irrelevant",
      "Systems where you need a nuanced consistency model beyond the binary CP/AP classification",
      "Real-time financial transaction systems that require linearizable consistency without compromise"
    ],
    "when_not_to_use_zh": [
      "分区容错性无关紧要的单节点或非分布式系统",
      "需要超出CP/AP二元分类的细致一致性模型的系统",
      "要求线性化一致性且不能妥协的实时金融交易系统"
    ],
    "adopters": [
      "Amazon",
      "Apache Cassandra",
      "Google Spanner",
      "CockroachDB",
      "Riak"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Eric Brewer (2000). \"Towards Robust Distributed Systems\". ACM PODC Keynote.",
    "secondary_sources": [
      "Seth Gilbert and Nancy Lynch (2002). \"Brewer's Conjecture and the Feasibility of Consistent, Available, Partition-Tolerant Web Services\". ACM SIGACT News, 33(2).",
      "Martin Kleppmann (2017). \"Designing Data-Intensive Applications\". O'Reilly Media. Chapter 9."
    ],
    "typed_relations": [
      {
        "slug": "cqrs-pattern",
        "type": "complement"
      },
      {
        "slug": "saga-pattern",
        "type": "complement"
      },
      {
        "slug": "eda",
        "type": "complement"
      }
    ]
  },
  {
    "id": 18,
    "name": "C4 Model",
    "name_zh": "C4模型",
    "slug": "c4-model",
    "category": "architecture",
    "desc": "Four-level hierarchy for visualizing software architecture",
    "desc_zh": "用于可视化软件架构的四层次模型",
    "steps": [
      "Draw a Context diagram: show the system as a box, its users, and external systems it interacts with",
      "Drill into a Container diagram: decompose the system into containers (apps, services, databases) with technology choices",
      "Create Component diagrams for key containers: show the internal components and how they collaborate",
      "Optionally add Code-level diagrams (UML class diagrams) for the most critical components",
      "Embed diagrams in documentation and ADRs; keep them as living artifacts updated with each architecture change"
    ],
    "steps_zh": [
      "绘制上下文图：将系统表示为方框，展示其用户和交互的外部系统",
      "深入绘制容器图：将系统分解为容器（应用、服务、数据库）并标注技术选型",
      "为关键容器创建组件图：展示内部组件及其协作关系",
      "可选地为最关键组件添加代码层级图（UML类图）",
      "将图表嵌入文档和ADR，作为随每次架构变更持续更新的活文档"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "System Context",
      "Container",
      "Component",
      "Code"
    ],
    "viz_labels_zh": [
      "系统上下文",
      "容器",
      "组件",
      "代码"
    ],
    "related": [
      "adr",
      "togaf-adm",
      "microservices-decomposition"
    ],
    "tags": [
      "visualization",
      "diagrams",
      "architecture-views",
      "documentation"
    ],
    "origin_author": "Simon Brown, 2011",
    "origin_source": "Software Architecture for Developers (Leanpub book) and c4model.com",
    "origin_source_zh": "《面向开发者的软件架构》（Leanpub书籍）及c4model.com",
    "complexity": "beginner",
    "when_to_use": [
      "When communicating software architecture to both technical and non-technical stakeholders",
      "When onboarding new developers who need to understand system structure at multiple levels",
      "When creating architecture documentation that stays maintainable and up-to-date",
      "When replacing ad-hoc whiteboard diagrams with a consistent, hierarchical visualization approach"
    ],
    "when_to_use_zh": [
      "向技术和非技术利益相关者传达软件架构时",
      "新开发者入职需要在多个层次理解系统结构时",
      "创建可维护且保持最新的架构文档时",
      "用一致的层级化可视化方法替代随意的白板图时"
    ],
    "core_concepts": [
      "Context diagram: The highest level showing the system boundary, users, and external dependencies",
      "Container diagram: Major runtime units (applications, databases, message queues) with technology labels",
      "Component diagram: Internal building blocks within a container showing responsibilities and relationships",
      "Code diagram: Lowest level showing classes or modules, typically auto-generated from code",
      "Abstraction hierarchy: Each level zooms into the previous one, like a map with increasing detail"
    ],
    "core_concepts_zh": [
      "上下文图：最高层级，展示系统边界、用户和外部依赖",
      "容器图：主要的运行时单元（应用、数据库、消息队列），标注技术选型",
      "组件图：容器内部的构建模块，展示职责和关系",
      "代码图：最低层级，展示类或模块，通常从代码自动生成",
      "抽象层级：每个层级放大前一个层级的细节，类似不同比例的地图"
    ],
    "timeline": [
      [
        "2006",
        "Simon Brown begins developing the C4 approach while consulting at various organizations"
      ],
      [
        "2011",
        "C4 Model formally published as part of 'Software Architecture for Developers'"
      ],
      [
        "2018",
        "Structurizr tooling released for diagram-as-code C4 modeling"
      ],
      [
        "2020",
        "C4-PlantUML integration brings C4 to the PlantUML ecosystem"
      ],
      [
        "2023",
        "C4 Model becomes the de facto standard for software architecture diagramming in many organizations"
      ]
    ],
    "timeline_zh": [
      [
        "2006",
        "Simon Brown在多个组织咨询期间开始开发C4方法"
      ],
      [
        "2011",
        "C4模型作为《面向开发者的软件架构》的一部分正式发布"
      ],
      [
        "2018",
        "Structurizr工具发布，支持以代码方式创建C4模型图"
      ],
      [
        "2020",
        "C4-PlantUML集成将C4引入PlantUML生态系统"
      ],
      [
        "2023",
        "C4模型成为众多组织中软件架构图绘制的事实标准"
      ]
    ],
    "dos": [
      "Do start with Context diagrams because they provide the broadest overview and are accessible to all audiences",
      "Do add a legend and clear labels to every diagram because C4 relies on explicitness over convention",
      "Do keep diagrams up to date as architecture evolves because stale diagrams erode trust in documentation",
      "Do use diagram-as-code tools like Structurizr because they integrate with version control and CI pipelines"
    ],
    "dos_zh": [
      "从上下文图开始，因为它提供最广泛的概览，所有受众都能理解",
      "为每张图添加图例和清晰标注，因为C4依赖明确性而非惯例",
      "随架构演进保持图表更新，因为过时的图表会削弱对文档的信任",
      "使用Structurizr等图表即代码工具，因为它们可与版本控制和CI管道集成"
    ],
    "donts": [
      "Don't create Code-level diagrams for every component because they become stale quickly and add maintenance burden",
      "Don't mix abstraction levels in a single diagram because it confuses the audience about scope",
      "Don't use C4 as a replacement for UML in all cases because some domains need more formal behavioral diagrams",
      "Don't overload diagrams with too many elements because readability drops sharply beyond 15-20 elements per diagram"
    ],
    "donts_zh": [
      "不要为每个组件都创建代码级图表，因为它们很快会过时并增加维护负担",
      "不要在单张图中混合不同抽象层级，因为这会混淆受众对范围的理解",
      "不要在所有情况下都用C4替代UML，某些领域需要更正式的行为图",
      "不要在图表中放入过多元素，超过15-20个元素时可读性会急剧下降"
    ],
    "case_study_company": "ING Bank",
    "case_study": "ING Bank adopted the C4 Model to standardize architecture documentation across hundreds of development teams during their large-scale agile transformation. By mandating Context and Container diagrams for every service, they created a navigable catalog of their entire technology landscape. This enabled cross-team dependency analysis and reduced integration issues by giving teams a shared vocabulary for discussing system boundaries.",
    "case_study_zh": "ING银行在大规模敏捷转型中采用C4模型，标准化数百个开发团队的架构文档。通过要求每个服务都提供上下文图和容器图，他们创建了整个技术全景的可导航目录。这使跨团队依赖分析成为可能，并通过为团队提供讨论系统边界的共享词汇来减少集成问题。",
    "when_not_to_use": [
      "Hardware-centric systems that need electrical or physical architecture diagrams rather than software views",
      "Formal model-driven development where UML with code generation is required",
      "Very small projects where a single whiteboard sketch suffices and maintaining multiple diagram levels is overhead"
    ],
    "when_not_to_use_zh": [
      "需要电气或物理架构图而非软件视图的硬件中心系统",
      "需要UML配合代码生成的正式模型驱动开发",
      "单张白板草图即可满足需求、维护多层图表反而增加开销的极小项目"
    ],
    "adopters": [
      "ING Bank",
      "Zalando",
      "Elastic",
      "Swiss Re",
      "Structurizr"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Simon Brown (2018). \"The C4 Model for Visualising Software Architecture\". c4model.com.",
    "secondary_sources": [
      "Simon Brown (2018). \"Software Architecture for Developers, Volume 2\". Leanpub.",
      "Simon Brown (2011). \"Software Architecture for Developers\". Leanpub."
    ],
    "typed_relations": [
      {
        "slug": "adr",
        "type": "complement"
      },
      {
        "slug": "togaf-adm",
        "type": "alternative"
      },
      {
        "slug": "microservices-decomposition",
        "type": "complement"
      }
    ]
  },
  {
    "id": 19,
    "name": "CQRS Pattern",
    "name_zh": "命令查询职责分离模式",
    "slug": "cqrs-pattern",
    "category": "architecture",
    "desc": "Separate read/write models for optimized scalability",
    "desc_zh": "分离读写模型以实现优化的可扩展性",
    "steps": [
      "Identify domains where read and write workloads have fundamentally different performance, scaling, or consistency requirements",
      "Design the Command model: create a write-optimized domain model that enforces invariants and emits domain events on state changes",
      "Design the Query model: build read-optimized projections (denormalized views, materialized tables) tailored to specific query patterns",
      "Connect models via events: use domain events or a message bus to asynchronously update read projections when the write model changes",
      "Handle eventual consistency: implement strategies for stale-read tolerance in the UI, version stamps, and read-your-own-writes where needed"
    ],
    "steps_zh": [
      "识别读写负载在性能、扩展或一致性需求上有本质差异的领域",
      "设计命令模型：创建写优化的领域模型，强制执行不变式并在状态变更时发出领域事件",
      "设计查询模型：构建针对特定查询模式优化的读视图（非规范化视图、物化表）",
      "通过事件连接模型：使用领域事件或消息总线在写模型变更时异步更新读投影",
      "处理最终一致性：为UI中的陈旧读取容忍度、版本戳和必要时的读己之写实现策略"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Command",
      "Query",
      "Write Model",
      "Read Model"
    ],
    "viz_labels_zh": [
      "命令",
      "查询",
      "写模型",
      "读模型"
    ],
    "related": [
      "event-sourcing-pattern",
      "eda",
      "domain-driven-design",
      "saga-pattern"
    ],
    "tags": [
      "cqrs",
      "read-write-separation",
      "scalability",
      "event-driven"
    ],
    "origin_author": "Greg Young, 2010",
    "origin_source": "CQRS Documents (Greg Young, 2010); builds on Bertrand Meyer's Command-Query Separation principle from 1988",
    "origin_source_zh": "《CQRS文档》（Greg Young，2010）；基于Bertrand Meyer 1988年提出的命令查询分离原则",
    "complexity": "advanced",
    "when_to_use": [
      "When read and write workloads have vastly different scaling requirements",
      "When complex domain logic on the write side doesn't align with the data shapes needed for queries",
      "When you need highly optimized read models for dashboards, reports, or search-heavy features",
      "When combined with Event Sourcing to build audit trails and temporal queries"
    ],
    "when_to_use_zh": [
      "读写工作负载具有截然不同的扩展需求时",
      "写侧的复杂领域逻辑与查询所需的数据形态不一致时",
      "需要为仪表盘、报表或大量搜索功能构建高度优化的读模型时",
      "与事件溯源结合构建审计追踪和时间查询时"
    ],
    "core_concepts": [
      "Command model: Write-optimized model focused on enforcing business rules and maintaining invariants",
      "Query model: Read-optimized projections denormalized for specific UI or API consumption patterns",
      "Eventual consistency: Read models are asynchronously updated and may lag behind the write model",
      "Event propagation: Domain events bridge the write and read sides, serving as the synchronization mechanism",
      "Task-based UI: User interfaces designed around commands (actions) rather than CRUD operations"
    ],
    "core_concepts_zh": [
      "命令模型：面向写优化的模型，专注于执行业务规则和维护不变式",
      "查询模型：为特定UI或API消费模式进行非规范化的读优化投影",
      "最终一致性：读模型异步更新，可能滞后于写模型",
      "事件传播：领域事件连接写侧和读侧，作为同步机制",
      "基于任务的UI：围绕命令（动作）而非CRUD操作设计的用户界面"
    ],
    "timeline": [
      [
        "1988",
        "Bertrand Meyer introduces Command-Query Separation (CQS) in 'Object-Oriented Software Construction'"
      ],
      [
        "2010",
        "Greg Young formalizes CQRS as an architectural pattern and publishes CQRS Documents"
      ],
      [
        "2011",
        "Microsoft publishes the CQRS Journey guidance project on Azure"
      ],
      [
        "2014",
        "Axon Framework released for Java, providing first-class CQRS and Event Sourcing support"
      ],
      [
        "2019",
        "CQRS patterns become mainstream in cloud-native and event-driven microservices architectures"
      ]
    ],
    "timeline_zh": [
      [
        "1988",
        "Bertrand Meyer在《面向对象软件构造》中引入命令查询分离（CQS）"
      ],
      [
        "2010",
        "Greg Young将CQRS形式化为架构模式并发布CQRS文档"
      ],
      [
        "2011",
        "Microsoft发布基于Azure的CQRS Journey指导项目"
      ],
      [
        "2014",
        "Axon Framework发布Java版本，提供CQRS和事件溯源一等支持"
      ],
      [
        "2019",
        "CQRS模式在云原生和事件驱动微服务架构中成为主流"
      ]
    ],
    "dos": [
      "Do start with a single database and logical separation before introducing physical read/write stores because premature splitting adds complexity",
      "Do design read models for specific use cases because one-size-fits-all projections defeat the purpose of CQRS",
      "Do implement idempotent event handlers because events may be delivered more than once",
      "Do combine with Event Sourcing when you need a complete audit trail and temporal query capabilities"
    ],
    "dos_zh": [
      "先从单一数据库和逻辑分离开始，再引入物理读写分离，因为过早拆分增加复杂性",
      "为特定用例设计读模型，因为通用投影违背了CQRS的初衷",
      "实现幂等的事件处理器，因为事件可能被多次投递",
      "需要完整审计追踪和时间查询能力时，与事件溯源结合使用"
    ],
    "donts": [
      "Don't apply CQRS to every bounded context because most CRUD-heavy modules don't benefit from the added complexity",
      "Don't ignore eventual consistency in the UI because users will be confused if they write data and don't see it immediately",
      "Don't build synchronous projections because it couples the write and read paths, negating CQRS benefits",
      "Don't skip monitoring the projection lag because undetected delays can cause significant user experience issues"
    ],
    "donts_zh": [
      "不要对每个限界上下文都应用CQRS，大多数以CRUD为主的模块不会从额外复杂性中获益",
      "不要在UI中忽视最终一致性，用户写入数据后若不能立即看到会感到困惑",
      "不要构建同步投影，因为这会耦合读写路径，抵消CQRS的好处",
      "不要忽略投影延迟的监控，未被发现的延迟可能导致严重的用户体验问题"
    ],
    "case_study_company": "Microsoft Azure DevOps",
    "case_study": "Microsoft applied CQRS in Azure DevOps (formerly VSTS) to handle the extreme asymmetry between read and write workloads in work item tracking. The write model enforces complex workflow rules and permissions, while multiple denormalized read models serve dashboards, queries, and reports. This separation allowed the read side to scale independently to handle millions of queries per minute during peak usage without impacting the consistency guarantees of the write path.",
    "case_study_zh": "Microsoft在Azure DevOps（原VSTS）中应用CQRS来处理工作项跟踪中读写工作负载的极端不对称。写模型执行复杂的工作流规则和权限控制，而多个非规范化读模型服务于仪表盘、查询和报表。这种分离使读侧能够独立扩展，在峰值使用期间处理每分钟数百万次查询，而不影响写路径的一致性保证。",
    "when_not_to_use": [
      "Simple CRUD applications where read and write models are nearly identical",
      "Small-scale systems where the operational overhead of maintaining two models exceeds the benefits",
      "Domains where strong real-time consistency between reads and writes is a hard requirement"
    ],
    "when_not_to_use_zh": [
      "读写模型几乎相同的简单CRUD应用",
      "维护两套模型的运维开销超过收益的小规模系统",
      "读写之间强实时一致性是硬性要求的领域"
    ],
    "adopters": [
      "Microsoft",
      "Axon",
      "EventStore",
      "Walmart",
      "Booking.com"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "performance",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Greg Young (2010). \"CQRS Documents\". cqrs.files.wordpress.com.",
    "secondary_sources": [
      "Bertrand Meyer (1988). \"Object-Oriented Software Construction\". Prentice Hall. Command-Query Separation principle.",
      "Martin Fowler (2011). \"CQRS\". martinfowler.com."
    ],
    "typed_relations": [
      {
        "slug": "event-sourcing-pattern",
        "type": "complement"
      },
      {
        "slug": "eda",
        "type": "prerequisite"
      },
      {
        "slug": "domain-driven-design",
        "type": "prerequisite"
      },
      {
        "slug": "saga-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 20,
    "name": "Event-Driven Architecture (EDA)",
    "name_zh": "事件驱动架构",
    "slug": "eda",
    "category": "architecture",
    "desc": "Systems communicate via asynchronous events for loose coupling",
    "desc_zh": "系统通过异步事件通信，实现松耦合",
    "steps": [
      "Identify domain events: meaningful state changes that other parts of the system need to react to",
      "Design event schemas with versioning strategy; choose event broker (Kafka, RabbitMQ, EventBridge)",
      "Implement event producers that publish events on state change and consumers that subscribe to relevant topics",
      "Define event choreography vs. orchestration: decide where coordination logic lives (in events vs. a central saga)",
      "Establish observability: event tracing, dead-letter queues, idempotency keys, and replay capabilities for resilience"
    ],
    "steps_zh": [
      "识别领域事件：系统其他部分需要响应的有意义的状态变更",
      "设计带有版本策略的事件模式；选择事件代理（Kafka、RabbitMQ、EventBridge）",
      "实现在状态变更时发布事件的生产者和订阅相关主题的消费者",
      "定义事件编排与事件协调：决定协调逻辑的归属（事件本身还是中央Saga）",
      "建立可观测性：事件追踪、死信队列、幂等键和回放能力以保障弹性"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Producer",
      "Event Bus",
      "Consumer",
      "Event Store"
    ],
    "viz_labels_zh": [
      "事件生产者",
      "事件总线",
      "事件消费者",
      "事件存储"
    ],
    "related": [
      "cqrs-pattern",
      "event-sourcing-pattern",
      "saga-pattern",
      "microservices-decomposition"
    ],
    "tags": [
      "events",
      "async",
      "messaging",
      "loose-coupling",
      "choreography"
    ],
    "origin_author": "Concept has roots in publish-subscribe systems; popularized by Gregor Hohpe and Bobby Woolf, 2003",
    "origin_source": "Enterprise Integration Patterns (Hohpe & Woolf, 2003)",
    "origin_source_zh": "《企业集成模式》（Hohpe与Woolf，2003年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When multiple services need to react to the same business event independently",
      "When you need to decouple producers from consumers to allow independent deployment and scaling",
      "When building real-time data pipelines that process streams of events",
      "When system components have different processing speeds and need buffering via queues"
    ],
    "when_to_use_zh": [
      "多个服务需要独立响应同一业务事件时",
      "需要将生产者与消费者解耦以支持独立部署和扩展时",
      "构建处理事件流的实时数据管道时",
      "系统组件处理速度不同，需要通过队列进行缓冲时"
    ],
    "core_concepts": [
      "Event: An immutable record of something that happened in the system, carrying relevant data",
      "Event broker: Infrastructure (Kafka, RabbitMQ, SNS/SQS) that routes events from producers to consumers",
      "Choreography: Services react to events autonomously without a central coordinator",
      "Orchestration: A central component directs the flow by sending commands and listening for events",
      "Idempotency: Consumers must handle duplicate event delivery gracefully using idempotency keys"
    ],
    "core_concepts_zh": [
      "事件：系统中发生事情的不可变记录，携带相关数据",
      "事件代理：将事件从生产者路由到消费者的基础设施（Kafka、RabbitMQ、SNS/SQS）",
      "编排：服务自主响应事件，无需中央协调器",
      "协调：中央组件通过发送命令和监听事件来指挥流程",
      "幂等性：消费者必须使用幂等键优雅地处理重复事件投递"
    ],
    "timeline": [
      [
        "1987",
        "Publish-subscribe pattern formalized in early distributed systems research"
      ],
      [
        "2003",
        "Hohpe and Woolf publish 'Enterprise Integration Patterns', codifying event-driven messaging"
      ],
      [
        "2011",
        "Apache Kafka released by LinkedIn, enabling high-throughput event streaming at scale"
      ],
      [
        "2015",
        "AWS launches Lambda and EventBridge, making serverless EDA accessible"
      ],
      [
        "2020",
        "Event-driven architecture becomes the dominant pattern for cloud-native microservices"
      ]
    ],
    "timeline_zh": [
      [
        "1987",
        "发布-订阅模式在早期分布式系统研究中被形式化"
      ],
      [
        "2003",
        "Hohpe和Woolf出版《企业集成模式》，系统化事件驱动消息传递"
      ],
      [
        "2011",
        "Apache Kafka由LinkedIn发布，支持大规模高吞吐量事件流"
      ],
      [
        "2015",
        "AWS推出Lambda和EventBridge，使无服务器EDA变得可及"
      ],
      [
        "2020",
        "事件驱动架构成为云原生微服务的主导模式"
      ]
    ],
    "dos": [
      "Do define a clear event schema registry and versioning strategy because schema evolution is inevitable",
      "Do implement dead-letter queues for every consumer because unprocessable events need a safe landing zone",
      "Do design events as facts about what happened rather than commands because it preserves loose coupling",
      "Do build replay capability because it enables recovery from consumer bugs and supports new projections"
    ],
    "dos_zh": [
      "定义清晰的事件模式注册表和版本策略，因为模式演进不可避免",
      "为每个消费者实现死信队列，因为无法处理的事件需要安全的存放区",
      "将事件设计为已发生事实的记录而非命令，因为这保持了松耦合",
      "构建事件回放能力，因为它支持从消费者缺陷中恢复并支持新投影"
    ],
    "donts": [
      "Don't put too much data in events because large event payloads create coupling and bandwidth issues",
      "Don't assume ordered delivery across partitions because most brokers only guarantee order within a partition",
      "Don't create circular event dependencies because they cause infinite loops and are extremely hard to debug",
      "Don't skip correlation IDs because without them tracing a business transaction across services is nearly impossible"
    ],
    "donts_zh": [
      "不要在事件中放入过多数据，因为过大的事件负载会造成耦合和带宽问题",
      "不要假设跨分区的有序投递，因为大多数代理只保证分区内有序",
      "不要创建循环事件依赖，因为它们会导致无限循环且极难调试",
      "不要省略关联ID，因为没有它们几乎无法在服务间追踪业务事务"
    ],
    "case_study_company": "LinkedIn",
    "case_study": "LinkedIn built Apache Kafka to solve their data integration challenge of connecting hundreds of microservices. Before Kafka, point-to-point integrations created a brittle mesh of dependencies. By moving to an event-driven architecture with Kafka as the central nervous system, LinkedIn processes over 7 trillion messages per day. This architecture enabled features like real-time activity feeds, recommendation engines, and monitoring systems to all consume the same event streams independently.",
    "case_study_zh": "LinkedIn开发了Apache Kafka来解决数百个微服务之间的数据集成挑战。在Kafka之前，点对点集成创建了一个脆弱的依赖网络。通过以Kafka作为中枢神经系统转向事件驱动架构，LinkedIn每天处理超过7万亿条消息。这种架构使实时动态流、推荐引擎和监控系统能够独立消费相同的事件流。",
    "case_study_challenge": "Hundreds of LinkedIn microservices needed to share data, but point-to-point integrations had created a brittle mesh of dependencies. Adding a new consumer meant modifying the producer, and a single slow downstream service could cascade failures across the entire platform.",
    "case_study_challenge_zh": "数百个LinkedIn微服务需要共享数据，但点对点集成已经形成了一张脆弱的依赖网络。新增一个消费者意味着必须修改生产者，而单个下游服务的延迟就能引发全平台的级联故障。",
    "case_study_approach": "LinkedIn's engineering team built Apache Kafka as a unified event backbone — a distributed commit log that decoupled producers from consumers entirely. Every state change became an immutable event on a topic. Activity feeds, recommendation engines, monitoring systems, and data warehouses each subscribed independently, consuming the same streams at their own pace.",
    "case_study_approach_zh": "LinkedIn工程团队构建了Apache Kafka作为统一的事件骨干——一个将生产者与消费者彻底解耦的分布式提交日志。每次状态变更都成为主题上的不可变事件。动态流、推荐引擎、监控系统和数据仓库各自独立订阅，按自己的节奏消费相同的数据流。",
    "case_study_result": "LinkedIn now processes over 7 trillion messages per day through Kafka. New features connect to the event backbone in hours rather than weeks. Kafka itself became the most widely adopted event streaming platform in the industry, powering real-time infrastructure at companies from Netflix to Goldman Sachs.",
    "case_study_result_zh": "LinkedIn如今通过Kafka每天处理超过7万亿条消息。新功能可以在数小时而非数周内接入事件骨干。Kafka本身成为业界采用最广泛的事件流平台，为从Netflix到高盛的众多企业的实时基础设施提供动力。",
    "case_study_quote": "We stopped thinking about data integration and started thinking about a central nervous system. Kafka is LinkedIn's bloodstream.",
    "case_study_quote_zh": "我们不再把它当作数据集成问题，而是开始思考如何构建中枢神经系统。Kafka就是LinkedIn的血液循环。",
    "when_not_to_use": [
      "Simple request-response interactions where synchronous communication is more straightforward",
      "Systems requiring strict real-time consistency where eventual consistency is unacceptable",
      "Small monolithic applications where the overhead of an event broker is unjustified"
    ],
    "when_not_to_use_zh": [
      "同步通信更直接的简单请求-响应交互场景",
      "需要严格实时一致性、最终一致性不可接受的系统",
      "事件代理的开销不合理的小型单体应用"
    ],
    "adopters": [
      "LinkedIn",
      "Netflix",
      "Uber",
      "Airbnb",
      "Shopify"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Gregor Hohpe and Bobby Woolf (2003). \"Enterprise Integration Patterns: Designing, Building, and Deploying Messaging Solutions\". Addison-Wesley.",
    "secondary_sources": [
      "Martin Fowler (2017). \"What do you mean by Event-Driven?\". martinfowler.com.",
      "Adam Bellemare (2020). \"Building Event-Driven Microservices\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "cqrs-pattern",
        "type": "complement"
      },
      {
        "slug": "event-sourcing-pattern",
        "type": "complement"
      },
      {
        "slug": "saga-pattern",
        "type": "extends"
      },
      {
        "slug": "microservices-decomposition",
        "type": "complement"
      }
    ]
  },
  {
    "id": 21,
    "name": "Microservices Decomposition Patterns",
    "name_zh": "微服务分解模式",
    "slug": "microservices-decomposition",
    "category": "architecture",
    "desc": "Strategies to split monoliths into focused, independent services",
    "desc_zh": "将单体应用拆分为职责明确的独立服务的策略集合",
    "steps": [
      "Analyze the monolith using domain analysis or dependency graphs to identify natural seams and hot spots",
      "Apply decomposition strategy: Decompose by Business Capability, Subdomain, or Strangler Fig for incremental migration",
      "Define service APIs and contracts first (API-first), then extract service logic and its data store",
      "Implement inter-service communication: synchronous REST/gRPC or asynchronous messaging based on coupling tolerance",
      "Validate each extracted service with independent deployment, monitoring, and rollback before continuing extraction"
    ],
    "steps_zh": [
      "使用领域分析或依赖关系图分析单体应用，识别自然接缝和热点",
      "应用分解策略：按业务能力、子域或绞杀者模式进行增量迁移",
      "先定义服务API和契约（API优先），再提取服务逻辑及其数据存储",
      "根据耦合容忍度选择服务间通信方式：同步REST/gRPC或异步消息",
      "对每个提取的服务进行独立部署、监控和回滚验证，再继续后续提取"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Domain Service",
      "API Gateway",
      "Data Store",
      "Boundary"
    ],
    "viz_labels_zh": [
      "领域服务",
      "API网关",
      "数据存储",
      "服务边界"
    ],
    "related": [
      "domain-driven-design",
      "eda",
      "service-mesh-pattern",
      "strangler-fig-pattern"
    ],
    "tags": [
      "microservices",
      "decomposition",
      "monolith",
      "service-boundaries"
    ],
    "origin_author": "James Lewis and Martin Fowler, 2014",
    "origin_source": "Microservices: a definition of this new architectural term (martinfowler.com, 2014)",
    "origin_source_zh": "《微服务：这一新架构术语的定义》（martinfowler.com，2014年）",
    "complexity": "advanced",
    "when_to_use": [
      "When a monolith's deployment frequency is bottlenecked by team coordination overhead",
      "When different parts of the system have vastly different scaling requirements",
      "When teams need to independently develop, deploy, and scale their owned services",
      "When migrating gradually from a legacy monolith using the Strangler Fig approach"
    ],
    "when_to_use_zh": [
      "单体应用的部署频率因团队协调开销而成为瓶颈时",
      "系统不同部分具有截然不同的扩展需求时",
      "团队需要独立开发、部署和扩展其负责的服务时",
      "使用绞杀者模式从遗留单体逐步迁移时"
    ],
    "core_concepts": [
      "Decompose by Business Capability: Align service boundaries with what the business does (e.g., payments, inventory)",
      "Decompose by Subdomain: Use DDD bounded contexts to find natural service boundaries",
      "Strangler Fig Pattern: Incrementally replace monolith functionality by routing traffic to new services",
      "Database per Service: Each microservice owns its data store to ensure loose coupling",
      "API Gateway: A single entry point that routes, aggregates, and translates client requests to internal services"
    ],
    "core_concepts_zh": [
      "按业务能力分解：将服务边界与业务功能对齐（如支付、库存）",
      "按子域分解：使用DDD限界上下文找到自然的服务边界",
      "绞杀者模式：通过将流量路由到新服务来增量替换单体功能",
      "服务独占数据库：每个微服务拥有自己的数据存储以确保松耦合",
      "API网关：路由、聚合和转换客户端请求到内部服务的单一入口点"
    ],
    "timeline": [
      [
        "2011",
        "Netflix begins its large-scale monolith-to-microservices migration"
      ],
      [
        "2014",
        "James Lewis and Martin Fowler publish the canonical microservices article"
      ],
      [
        "2015",
        "Sam Newman publishes 'Building Microservices', establishing practical patterns"
      ],
      [
        "2018",
        "Service mesh tools (Istio, Linkerd) mature, addressing microservice networking complexity"
      ],
      [
        "2023",
        "Industry recognizes 'right-sizing' services, moving beyond naive fine-grained decomposition"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Netflix开始大规模从单体向微服务迁移"
      ],
      [
        "2014",
        "James Lewis和Martin Fowler发表经典微服务文章"
      ],
      [
        "2015",
        "Sam Newman出版《构建微服务》，建立实践模式"
      ],
      [
        "2018",
        "服务网格工具（Istio、Linkerd）成熟，应对微服务网络复杂性"
      ],
      [
        "2023",
        "业界认识到「合理粒度」服务的重要性，超越简单的细粒度分解"
      ]
    ],
    "dos": [
      "Do start with a well-structured modular monolith before decomposing because clear boundaries make extraction easier",
      "Do define service boundaries using domain-driven design because business alignment prevents arbitrary splits",
      "Do automate deployment pipelines for each service because manual deployment negates microservices benefits",
      "Do implement contract testing between services because integration breakage is the top microservices risk"
    ],
    "dos_zh": [
      "先构建结构良好的模块化单体再进行分解，因为清晰的边界使提取更容易",
      "使用领域驱动设计定义服务边界，因为业务对齐可防止任意拆分",
      "为每个服务自动化部署管道，因为手动部署会抵消微服务的好处",
      "在服务间实现契约测试，因为集成破坏是微服务最大的风险"
    ],
    "donts": [
      "Don't decompose a monolith you don't understand because you'll replicate its problems in distributed form",
      "Don't create nano-services by splitting too finely because excessive network calls destroy performance",
      "Don't share databases between services because it creates hidden coupling that defeats service independence",
      "Don't skip investing in observability because debugging distributed systems without tracing is nearly impossible"
    ],
    "donts_zh": [
      "不要分解你不理解的单体，因为你会以分布式形式复制其问题",
      "不要通过过度细分创建纳米服务，因为过多的网络调用会摧毁性能",
      "不要在服务间共享数据库，因为这会产生破坏服务独立性的隐性耦合",
      "不要跳过可观测性投入，因为没有追踪的分布式系统调试几乎不可能"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix migrated from a monolithic Java application to over 700 microservices between 2011 and 2016. The migration used the Strangler Fig pattern, gradually replacing monolith endpoints with new services. Each service owned its data and communicated via REST and asynchronous messaging. This transformation enabled Netflix to deploy hundreds of times per day, scale individual services based on demand, and achieve 99.99% availability for its streaming platform serving 200+ million subscribers.",
    "case_study_zh": "Netflix在2011至2016年间从单体Java应用迁移到超过700个微服务。迁移使用绞杀者模式，逐步用新服务替换单体端点。每个服务拥有自己的数据，通过REST和异步消息通信。这一转型使Netflix能够每天部署数百次、按需扩展单个服务，并为超过2亿订阅用户的流媒体平台实现99.99%的可用性。",
    "when_not_to_use": [
      "Early-stage startups where the domain model is still being discovered and boundaries will shift",
      "Small teams (fewer than 8 developers) where microservices overhead exceeds coordination benefits",
      "Systems with very tight latency requirements where inter-service network calls add unacceptable delay"
    ],
    "when_not_to_use_zh": [
      "领域模型仍在探索中且边界会持续变化的早期创业公司",
      "微服务开销超过协调收益的小型团队（少于8名开发者）",
      "服务间网络调用会增加不可接受延迟的极低延迟要求系统"
    ],
    "adopters": [
      "Netflix",
      "Amazon",
      "Uber",
      "Spotify",
      "SoundCloud"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "James Lewis and Martin Fowler (2014). \"Microservices: A Definition of This New Architectural Term\". martinfowler.com.",
    "secondary_sources": [
      "Sam Newman (2015). \"Building Microservices: Designing Fine-Grained Systems\". O'Reilly Media.",
      "Chris Richardson (2018). \"Microservices Patterns: With Examples in Java\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "domain-driven-design",
        "type": "prerequisite"
      },
      {
        "slug": "eda",
        "type": "complement"
      },
      {
        "slug": "service-mesh-pattern",
        "type": "extends"
      },
      {
        "slug": "strangler-fig-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 22,
    "name": "Saga Pattern",
    "name_zh": "Saga 模式",
    "slug": "saga-pattern",
    "category": "architecture",
    "desc": "Manage distributed transactions via compensating actions",
    "desc_zh": "通过补偿动作管理分布式事务，保障跨服务数据一致性",
    "steps": [
      "Identify the distributed transaction: map all services involved and the sequence of local transactions each must perform",
      "Choose orchestration vs. choreography: use a central saga orchestrator for complex flows, or event-driven choreography for simpler chains",
      "Define compensating actions: for each step, implement a reverse operation that undoes the effect if a later step fails",
      "Implement idempotency: ensure each step and its compensation can be safely retried without side effects using idempotency keys",
      "Add observability and timeout handling: track saga state transitions, set step timeouts, and alert on stuck or failed sagas for manual resolution"
    ],
    "steps_zh": [
      "识别分布式事务：映射所有涉及的服务以及每个服务必须执行的本地事务序列",
      "选择编排与协调模式：复杂流程使用中央Saga编排器，简单链路使用事件驱动的协调模式",
      "定义补偿动作：为每个步骤实现反向操作，在后续步骤失败时撤销该步骤的效果",
      "实现幂等性：使用幂等键确保每个步骤及其补偿操作可安全重试而无副作用",
      "添加可观测性和超时处理：追踪Saga状态转换，设置步骤超时，对卡住或失败的Saga发出告警以便人工处理"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Step",
      "Compensate",
      "Choreograph",
      "Orchestrate"
    ],
    "viz_labels_zh": [
      "事务步骤",
      "补偿操作",
      "编排协调",
      "集中控制"
    ],
    "related": [
      "eda",
      "cqrs-pattern",
      "circuit-breaker-pattern",
      "microservices-decomposition"
    ],
    "tags": [
      "distributed-transactions",
      "compensation",
      "orchestration",
      "choreography",
      "consistency"
    ],
    "origin_author": "Hector Garcia-Molina and Kenneth Salem, 1987",
    "origin_source": "Sagas (ACM SIGMOD Conference, 1987)",
    "origin_source_zh": "《Sagas》（1987年ACM SIGMOD会议论文）",
    "complexity": "advanced",
    "when_to_use": [
      "When a business process spans multiple microservices and requires all-or-nothing semantics",
      "When distributed transactions (2PC) are not feasible due to latency or service autonomy requirements",
      "When you need compensating logic to undo partial work in case of failures",
      "When order processing, booking, or payment flows involve multiple independent services"
    ],
    "when_to_use_zh": [
      "业务流程跨越多个微服务且需要全有或全无语义时",
      "因延迟或服务自治需求导致分布式事务（2PC）不可行时",
      "需要补偿逻辑在故障时撤销部分完成的工作时",
      "订单处理、预订或支付流程涉及多个独立服务时"
    ],
    "core_concepts": [
      "Compensating transaction: A reverse operation that semantically undoes a previously committed local transaction",
      "Orchestration: A central saga coordinator directs each participant and handles failure routing",
      "Choreography: Each service listens for events and decides independently whether to proceed or compensate",
      "Semantic rollback: Unlike database rollback, compensation may not restore exact original state but achieves business equivalence",
      "Saga execution coordinator (SEC): Tracks saga state and ensures all steps complete or all compensations execute"
    ],
    "core_concepts_zh": [
      "补偿事务：在语义上撤销先前已提交的本地事务的反向操作",
      "编排：中央Saga协调器指挥每个参与者并处理故障路由",
      "协调：每个服务监听事件并独立决定继续执行还是进行补偿",
      "语义回滚：与数据库回滚不同，补偿可能不恢复精确原始状态但实现业务等价",
      "Saga执行协调器（SEC）：追踪Saga状态，确保所有步骤完成或所有补偿执行"
    ],
    "timeline": [
      [
        "1987",
        "Garcia-Molina and Salem publish the original Sagas paper at ACM SIGMOD"
      ],
      [
        "2015",
        "Caitie McCaffrey presents on distributed sagas at Strange Loop, reviving interest"
      ],
      [
        "2017",
        "Chris Richardson popularizes sagas in microservices context via microservices.io"
      ],
      [
        "2019",
        "Temporal.io and other orchestration engines provide first-class saga support"
      ],
      [
        "2021",
        "Saga pattern becomes standard practice in event-driven microservices architectures"
      ]
    ],
    "timeline_zh": [
      [
        "1987",
        "Garcia-Molina和Salem在ACM SIGMOD发表原始Sagas论文"
      ],
      [
        "2015",
        "Caitie McCaffrey在Strange Loop演讲分布式Sagas，重新引起关注"
      ],
      [
        "2017",
        "Chris Richardson通过microservices.io在微服务语境中推广Saga模式"
      ],
      [
        "2019",
        "Temporal.io等编排引擎提供一等Saga支持"
      ],
      [
        "2021",
        "Saga模式成为事件驱动微服务架构的标准实践"
      ]
    ],
    "dos": [
      "Do design compensating actions for every forward step before implementation because retrofitting compensations is error-prone",
      "Do make each step and compensation idempotent because network failures will cause retries",
      "Do use a saga state machine to track progress because it simplifies debugging and recovery",
      "Do set timeouts on each saga step because hung steps can block the entire business process"
    ],
    "dos_zh": [
      "在实现之前为每个正向步骤设计补偿动作，因为事后添加补偿容易出错",
      "确保每个步骤及其补偿是幂等的，因为网络故障会导致重试",
      "使用Saga状态机追踪进度，因为这简化了调试和恢复",
      "为每个Saga步骤设置超时，因为挂起的步骤可能阻塞整个业务流程"
    ],
    "donts": [
      "Don't assume compensations will always succeed because they can fail too, requiring retry queues and manual intervention plans",
      "Don't use sagas for operations that can be done in a single database transaction because the complexity is unnecessary",
      "Don't ignore the visibility problem because without saga-level tracing, failures are invisible to operations teams",
      "Don't mix choreography and orchestration in the same saga because it makes the flow extremely hard to reason about"
    ],
    "donts_zh": [
      "不要假设补偿总会成功，因为补偿也可能失败，需要重试队列和人工干预计划",
      "不要对可以在单个数据库事务中完成的操作使用Saga，因为增加的复杂性没有必要",
      "不要忽视可见性问题，因为没有Saga级别的追踪，故障对运维团队是不可见的",
      "不要在同一个Saga中混用编排和协调，因为这会使流程极难推理"
    ],
    "case_study_company": "Uber",
    "case_study": "Uber uses the saga pattern to coordinate its ride-booking flow across multiple microservices including matching, pricing, payment, and driver notification. When a ride is requested, each service executes its local transaction and emits an event. If payment fails after a driver has been matched, a compensating action releases the driver back to the available pool and notifies the rider. Uber's saga orchestrator (built on Cadence, later Temporal) processes millions of ride sagas daily with exactly-once semantics.",
    "case_study_zh": "Uber使用Saga模式协调其跨多个微服务的叫车流程，包括匹配、定价、支付和司机通知。当请求叫车时，每个服务执行本地事务并发出事件。如果在司机匹配后支付失败，补偿动作会将司机释放回可用池并通知乘客。Uber的Saga编排器（基于Cadence构建，后演变为Temporal）每天以精确一次语义处理数百万次叫车Saga。",
    "case_study_challenge": "A single Uber ride touches matching, pricing, payment, and driver notification — four independent microservices that must coordinate as one atomic business transaction. Traditional distributed transactions (two-phase commit) were too slow and too brittle for millions of concurrent rides.",
    "case_study_challenge_zh": "一次Uber叫车涉及匹配、定价、支付和司机通知——四个独立微服务必须协同完成一个原子性业务事务。传统的分布式事务（两阶段提交）对于数百万并发行程而言太慢、太脆弱。",
    "case_study_approach": "Uber implemented orchestrated sagas using Cadence (later evolved into Temporal). Each service executes its local transaction and emits an event. The saga orchestrator tracks the overall state and, when a step fails — say, payment declines after a driver is already matched — triggers compensating actions: releasing the driver back to the available pool and notifying the rider.",
    "case_study_approach_zh": "Uber使用Cadence（后来演进为Temporal）实现了编排式Saga。每个服务执行本地事务并发出事件。Saga编排器追踪整体状态，当某步骤失败时——例如司机已匹配但支付被拒——触发补偿操作：将司机释放回可用池并通知乘客。",
    "case_study_result": "Uber's saga orchestrator processes millions of ride sagas daily with exactly-once semantics. The compensating action pattern ensures that partial failures never leave the system in an inconsistent state — no phantom charges, no ghost rides, no stranded drivers.",
    "case_study_result_zh": "Uber的Saga编排器每天以精确一次语义处理数百万次行程Saga。补偿操作模式确保部分失败永远不会使系统处于不一致状态——没有幽灵扣款、没有虚假行程、没有滞留的司机。",
    "case_study_quote": "Every ride is a saga. The question is never whether something will fail — it's whether your system knows how to undo gracefully.",
    "case_study_quote_zh": "每次行程都是一个Saga。问题从来不是是否会失败——而是你的系统是否知道如何优雅地回滚。",
    "when_not_to_use": [
      "Operations that can fit within a single ACID transaction boundary",
      "Systems where the business can tolerate partial completion without compensation",
      "Extremely low-latency paths where saga coordination overhead is unacceptable"
    ],
    "when_not_to_use_zh": [
      "可以在单个ACID事务边界内完成的操作",
      "业务可以容忍部分完成而无需补偿的系统",
      "Saga协调开销不可接受的极低延迟路径"
    ],
    "adopters": [
      "Uber",
      "Airbnb",
      "Stripe",
      "Booking.com",
      "DoorDash"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Hector Garcia-Molina and Kenneth Salem (1987). \"Sagas\". ACM SIGMOD Conference.",
    "secondary_sources": [
      "Chris Richardson (2018). \"Microservices Patterns: With Examples in Java\". Manning Publications. Chapter 4.",
      "Caitie McCaffrey (2015). \"Applying the Saga Pattern\". Strange Loop Conference."
    ],
    "typed_relations": [
      {
        "slug": "eda",
        "type": "prerequisite"
      },
      {
        "slug": "cqrs-pattern",
        "type": "complement"
      },
      {
        "slug": "circuit-breaker-pattern",
        "type": "complement"
      },
      {
        "slug": "microservices-decomposition",
        "type": "complement"
      }
    ]
  },
  {
    "id": 23,
    "name": "TOGAF Architecture Development Method (ADM)",
    "name_zh": "TOGAF架构开发方法",
    "slug": "togaf-adm",
    "category": "architecture",
    "desc": "Iterative enterprise architecture lifecycle with defined phases",
    "desc_zh": "具有明确阶段的迭代式企业架构生命周期方法",
    "steps": [
      "Establish Architecture Vision: define scope, stakeholders, constraints, and high-level target architecture statement",
      "Develop Business, Information Systems (Data + Application), and Technology Architecture baselines and targets",
      "Perform gap analysis between baseline and target for each domain; identify work packages",
      "Create Migration Plan: prioritize work packages, define transition architectures, and roadmap sequencing",
      "Govern implementation: establish Architecture Board oversight, compliance reviews, and update the Architecture Repository"
    ],
    "steps_zh": [
      "建立架构愿景：定义范围、利益相关者、约束条件和高层次目标架构声明",
      "开发业务架构、信息系统架构（数据+应用）和技术架构的基线与目标",
      "对每个领域进行基线与目标的差距分析；识别工作包",
      "制定迁移计划：对工作包排定优先级，定义过渡架构和路线图顺序",
      "治理实施：建立架构委员会监督、合规审查，并更新架构知识库"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Preliminary",
      "Vision",
      "Architecture",
      "Migration",
      "Governance"
    ],
    "viz_labels_zh": [
      "预备阶段",
      "架构愿景",
      "架构开发",
      "迁移规划",
      "架构治理"
    ],
    "related": [
      "c4-model",
      "adr",
      "wardley-mapping"
    ],
    "tags": [
      "enterprise-architecture",
      "governance",
      "lifecycle",
      "methodology"
    ],
    "origin_author": "The Open Group, 1995",
    "origin_source": "TOGAF Standard (The Open Group Architecture Framework), first published 1995, current version TOGAF 10",
    "origin_source_zh": "《TOGAF标准》（开放组架构框架），1995年首次发布，当前版本为TOGAF 10",
    "complexity": "advanced",
    "when_to_use": [
      "When an enterprise needs a structured approach to align IT architecture with business strategy",
      "When managing large-scale digital transformation programs across multiple business units",
      "When governance and compliance requirements demand formal architecture review processes",
      "When building an enterprise architecture practice from scratch in a large organization"
    ],
    "when_to_use_zh": [
      "企业需要结构化方法将IT架构与业务战略对齐时",
      "管理跨多个业务单元的大规模数字化转型项目时",
      "治理和合规要求需要正式架构审查流程时",
      "在大型组织中从零开始建立企业架构实践时"
    ],
    "core_concepts": [
      "Architecture Development Method: An iterative cycle of phases from Vision through Migration and Governance",
      "Architecture Repository: A structured store of architecture assets, standards, reference models, and governance records",
      "Architecture Building Blocks: Reusable architecture components that can be combined into solutions",
      "Gap analysis: Systematic comparison of baseline and target architectures to identify required changes",
      "Transition architecture: Intermediate states between current and target architecture that are deployable and valuable"
    ],
    "core_concepts_zh": [
      "架构开发方法：从愿景到迁移和治理的迭代阶段循环",
      "架构知识库：架构资产、标准、参考模型和治理记录的结构化存储",
      "架构构建块：可组合成解决方案的可复用架构组件",
      "差距分析：系统地比较基线和目标架构以识别所需变更",
      "过渡架构：当前架构与目标架构之间可部署且有价值的中间状态"
    ],
    "timeline": [
      [
        "1995",
        "The Open Group publishes the first version of TOGAF based on the US DoD TAFIM framework"
      ],
      [
        "2003",
        "TOGAF 8 introduces the Architecture Development Method (ADM) cycle"
      ],
      [
        "2009",
        "TOGAF 9 released with major restructuring and the content metamodel"
      ],
      [
        "2018",
        "TOGAF 9.2 adds agile and digital transformation guidance"
      ],
      [
        "2022",
        "TOGAF Standard 10th Edition released with modular structure and updated for cloud-era enterprise architecture"
      ]
    ],
    "timeline_zh": [
      [
        "1995",
        "开放组基于美国国防部TAFIM框架发布TOGAF第一版"
      ],
      [
        "2003",
        "TOGAF 8引入架构开发方法（ADM）循环"
      ],
      [
        "2009",
        "TOGAF 9发布，进行重大重构并引入内容元模型"
      ],
      [
        "2018",
        "TOGAF 9.2添加敏捷和数字化转型指导"
      ],
      [
        "2022",
        "TOGAF标准第10版发布，采用模块化结构并针对云时代企业架构更新"
      ]
    ],
    "dos": [
      "Do tailor TOGAF to your organization's size and culture because the full framework is designed to be adapted",
      "Do maintain the Architecture Repository as a living asset because it is the institutional memory of architectural decisions",
      "Do engage business stakeholders early in the Vision phase because IT-only architecture fails to deliver business value",
      "Do use transition architectures because big-bang migrations carry excessive risk"
    ],
    "dos_zh": [
      "根据组织规模和文化裁剪TOGAF，因为完整框架本身就是为适配而设计的",
      "将架构知识库作为活资产维护，因为它是架构决策的组织记忆",
      "在愿景阶段尽早让业务利益相关者参与，因为纯IT架构无法交付业务价值",
      "使用过渡架构，因为大爆炸式迁移承担过高风险"
    ],
    "donts": [
      "Don't apply TOGAF rigidly without tailoring because the overhead will crush agility in smaller organizations",
      "Don't treat ADM phases as a strict waterfall because the method is designed to be iterative and adaptive",
      "Don't create architecture artifacts that nobody reads because documentation without consumers is waste",
      "Don't skip the Requirements Management phase because it anchors all ADM iterations to actual business needs"
    ],
    "donts_zh": [
      "不要未经裁剪就严格应用TOGAF，因为在较小组织中其开销会扼杀敏捷性",
      "不要将ADM阶段视为严格的瀑布模型，因为该方法设计为迭代和自适应的",
      "不要创建无人阅读的架构文档，因为没有消费者的文档是浪费",
      "不要跳过需求管理阶段，因为它将所有ADM迭代锚定于实际业务需求"
    ],
    "case_study_company": "US Department of Defense",
    "case_study": "The US Department of Defense adopted TOGAF as the basis for its enterprise architecture practice across military branches. Using the ADM cycle, DoD created baseline and target architectures for its IT consolidation initiative, reducing data center count from over 2,000 to fewer than 800. The Architecture Repository became the authoritative source for technology standards and reference architectures, enabling interoperability across Army, Navy, and Air Force systems.",
    "case_study_zh": "美国国防部采用TOGAF作为跨军种企业架构实践的基础。使用ADM循环，国防部为其IT整合计划创建了基线和目标架构，将数据中心数量从2000多个减少到不足800个。架构知识库成为技术标准和参考架构的权威来源，实现了陆军、海军和空军系统间的互操作性。",
    "when_not_to_use": [
      "Startups and small companies where lightweight architecture practices like ADRs and C4 suffice",
      "Purely agile product teams that need fast iteration without formal governance gates",
      "Projects with a single system where enterprise-level architecture planning is overkill"
    ],
    "when_not_to_use_zh": [
      "ADR和C4等轻量级架构实践即可满足需求的创业公司和小型企业",
      "需要快速迭代而无需正式治理关卡的纯敏捷产品团队",
      "企业级架构规划过度的单一系统项目"
    ],
    "adopters": [
      "US Department of Defense",
      "Capgemini",
      "Infosys",
      "Accenture",
      "Deutsche Bank"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "The Open Group (1995). \"TOGAF: The Open Group Architecture Framework\". The Open Group.",
    "secondary_sources": [
      "The Open Group (2022). \"TOGAF Standard, 10th Edition\". The Open Group.",
      "Andrew Josey et al. (2011). \"TOGAF Version 9.1\". Van Haren Publishing."
    ],
    "typed_relations": [
      {
        "slug": "c4-model",
        "type": "complement"
      },
      {
        "slug": "adr",
        "type": "complement"
      },
      {
        "slug": "wardley-mapping",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 24,
    "name": "Quality Attribute Workshop (QAW)",
    "name_zh": "质量属性工作坊",
    "slug": "qaw",
    "category": "architecture",
    "desc": "Elicit and prioritize quality attribute requirements collaboratively",
    "desc_zh": "协作式挖掘并优先排序质量属性需求",
    "steps": [
      "Identify and invite key stakeholders representing business, operations, security, and development perspectives",
      "Present the system mission and business context; allow stakeholders to brainstorm quality attribute concerns freely",
      "Structure concerns into quality attribute scenarios using the SEI scenario template (stimulus, environment, response, measure)",
      "Prioritize scenarios using voting or weighted ranking; group into utility tree categories (performance, security, etc.)",
      "Produce a ranked Quality Attribute Scenario backlog to drive architecture decisions and evaluation sessions"
    ],
    "steps_zh": [
      "识别并邀请代表业务、运营、安全和开发视角的关键利益相关者",
      "介绍系统使命和业务背景；允许利益相关者自由头脑风暴质量属性关切",
      "使用SEI场景模板（刺激、环境、响应、度量）将关切结构化为质量属性场景",
      "使用投票或加权排名对场景进行优先级排序；按效用树分类（性能、安全等）",
      "生成有优先级的质量属性场景待办列表，用于驱动架构决策和评估会议"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "Performance",
      "Security",
      "Availability",
      "Modifiability",
      "Testability"
    ],
    "viz_labels_zh": [
      "性能",
      "安全",
      "可用性",
      "可修改性",
      "可测试性"
    ],
    "related": [
      "atam",
      "trade-off-sliders",
      "four-golden-signals"
    ],
    "tags": [
      "quality-attributes",
      "workshop",
      "stakeholders",
      "requirements"
    ],
    "origin_author": "Mario Barbacci, Robert Ellison, et al. / SEI, 2003",
    "origin_source": "Quality Attribute Workshop (QAWs) Third Edition (SEI Technical Report CMU/SEI-2003-TR-016)",
    "origin_source_zh": "《质量属性工作坊（QAWs）第三版》（SEI技术报告CMU/SEI-2003-TR-016）",
    "complexity": "intermediate",
    "when_to_use": [
      "When starting a new project and quality requirements are unclear or unstated",
      "When stakeholders have conflicting views on which quality attributes matter most",
      "When feeding scenarios into an ATAM evaluation session",
      "When transitioning from functional requirements to non-functional architecture design"
    ],
    "when_to_use_zh": [
      "开始新项目且质量需求不明确或未被表述时",
      "利益相关者对哪些质量属性最重要存在分歧时",
      "为ATAM评估会议准备场景输入时",
      "从功能需求过渡到非功能架构设计时"
    ],
    "core_concepts": [
      "Quality attribute scenario: A structured description with stimulus, environment, response, and measurable response measure",
      "Utility tree: A hierarchical grouping of quality attributes into categories with prioritized leaf scenarios",
      "Stakeholder brainstorming: Open elicitation where diverse perspectives surface hidden quality concerns",
      "Scenario prioritization: Voting or ranking to identify the most architecturally significant scenarios",
      "Quality attribute categories: Performance, availability, security, modifiability, usability, and others from the SEI taxonomy"
    ],
    "core_concepts_zh": [
      "质量属性场景：包含刺激、环境、响应和可度量响应度量的结构化描述",
      "效用树：将质量属性按类别层级分组，叶节点为有优先级的场景",
      "利益相关者头脑风暴：开放式挖掘，多元视角揭示隐藏的质量关切",
      "场景优先级排序：通过投票或排名识别架构上最重要的场景",
      "质量属性分类：SEI分类体系中的性能、可用性、安全性、可修改性、可用性等"
    ],
    "timeline": [
      [
        "2000",
        "SEI develops the initial QAW method as a companion to ATAM"
      ],
      [
        "2003",
        "SEI publishes the third edition of the QAW technical report"
      ],
      [
        "2006",
        "QAW becomes widely used in government and defense architecture programs"
      ],
      [
        "2012",
        "Lightweight adaptations of QAW emerge for agile and lean teams"
      ],
      [
        "2018",
        "QAW principles integrated into modern architecture review practices like fitness functions"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "SEI开发初始QAW方法作为ATAM的配套工具"
      ],
      [
        "2003",
        "SEI发布QAW技术报告第三版"
      ],
      [
        "2006",
        "QAW在政府和国防架构项目中被广泛使用"
      ],
      [
        "2012",
        "针对敏捷和精益团队的轻量级QAW适配版出现"
      ],
      [
        "2018",
        "QAW原则被整合到适应度函数等现代架构审查实践中"
      ]
    ],
    "dos": [
      "Do invite stakeholders from diverse roles because developers, operators, and business users have different quality priorities",
      "Do use the SEI scenario template rigorously because vague scenarios cannot drive architecture decisions",
      "Do limit the workshop to one day because stakeholder fatigue degrades scenario quality",
      "Do connect QAW output directly to architecture evaluation because scenarios without follow-through are wasted effort"
    ],
    "dos_zh": [
      "邀请不同角色的利益相关者，因为开发者、运维和业务用户有不同的质量优先级",
      "严格使用SEI场景模板，因为模糊的场景无法驱动架构决策",
      "将工作坊限制在一天内，因为利益相关者的疲劳会降低场景质量",
      "将QAW输出直接关联到架构评估，因为没有后续跟进的场景是浪费"
    ],
    "donts": [
      "Don't let a single stakeholder dominate brainstorming because it suppresses important minority perspectives",
      "Don't confuse functional requirements with quality attribute scenarios because QAW focuses on the '-ilities' not features",
      "Don't skip the prioritization step because treating all scenarios equally makes architecture design impossible",
      "Don't conduct QAW without a facilitator because untrained facilitation leads to unfocused and unproductive sessions"
    ],
    "donts_zh": [
      "不要让单个利益相关者主导头脑风暴，因为这会压制重要的少数派视角",
      "不要混淆功能需求与质量属性场景，因为QAW关注的是质量特性而非功能",
      "不要跳过优先级排序步骤，因为平等对待所有场景会使架构设计无从下手",
      "不要在没有引导者的情况下进行QAW，因为未经训练的引导会导致会议失焦且无成效"
    ],
    "case_study_company": "Philips Healthcare",
    "case_study": "Philips Healthcare used QAW when designing the architecture of their patient monitoring platform that serves hospitals worldwide. The workshop surfaced critical quality attributes around real-time data latency (patient vitals must display within 2 seconds), availability (99.999% uptime for ICU monitors), and regulatory compliance (FDA Class II requirements). These prioritized scenarios directly shaped the decision to use a redundant publish-subscribe architecture with local edge processing.",
    "case_study_zh": "飞利浦医疗在设计服务全球医院的患者监护平台架构时使用了QAW。工作坊揭示了关于实时数据延迟（患者体征必须在2秒内显示）、可用性（ICU监护仪99.999%正常运行时间）和法规合规（FDA二类要求）的关键质量属性。这些有优先级的场景直接影响了采用带有本地边缘处理的冗余发布-订阅架构的决策。",
    "when_not_to_use": [
      "Projects where quality requirements are already well-defined through existing SLAs and compliance frameworks",
      "Very small teams where informal conversation achieves the same quality requirement alignment",
      "Rapid prototyping phases where quality attributes will be revisited once the concept is validated"
    ],
    "when_not_to_use_zh": [
      "通过现有SLA和合规框架已明确定义质量需求的项目",
      "非正式对话即可实现同等质量需求对齐的极小团队",
      "概念验证后将重新审视质量属性的快速原型阶段"
    ],
    "adopters": [
      "Philips Healthcare",
      "Lockheed Martin",
      "Bosch",
      "Siemens",
      "US Department of Veterans Affairs"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "reliability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Mario Barbacci, Robert Ellison et al. (2003). \"Quality Attribute Workshop (QAWs), Third Edition\". SEI Technical Report CMU/SEI-2003-TR-016.",
    "secondary_sources": [
      "Len Bass, Paul Clements, and Rick Kazman (2003). \"Software Architecture in Practice, 2nd Edition\". Addison-Wesley.",
      "Rick Kazman et al. (2001). \"Quality Attribute Workshop Participants Handbook\". SEI Special Report CMU/SEI-2001-SR-001."
    ],
    "typed_relations": [
      {
        "slug": "atam",
        "type": "extends"
      },
      {
        "slug": "trade-off-sliders",
        "type": "complement"
      },
      {
        "slug": "four-golden-signals",
        "type": "complement"
      }
    ]
  },
  {
    "id": 25,
    "name": "Actor Model",
    "name_zh": "Actor 模型",
    "slug": "actor-model",
    "category": "architecture",
    "desc": "Concurrent computation using message-passing actors",
    "desc_zh": "通过消息传递的 Actor 实现并发计算模型",
    "steps": [
      "Identify concurrency boundaries: map system components that need to operate independently and communicate asynchronously",
      "Design actors: define each actor's state, behavior, and the set of messages it can receive and send",
      "Implement message passing: use an actor framework (Akka, Orleans, Erlang/OTP) to handle mailboxes, scheduling, and delivery guarantees",
      "Design supervision hierarchies: define parent-child relationships where supervisors decide restart, stop, or escalate strategies on child failure",
      "Test for message ordering and deadlocks: verify that the system handles out-of-order delivery, mailbox overflow, and actor lifecycle correctly"
    ],
    "steps_zh": [
      "识别并发边界：映射需要独立运行并异步通信的系统组件",
      "设计Actor：定义每个Actor的状态、行为以及可接收和发送的消息集合",
      "实现消息传递：使用Actor框架（Akka、Orleans、Erlang/OTP）处理邮箱、调度和投递保证",
      "设计监督层级：定义父子关系，由监督者决定子Actor失败时的重启、停止或升级策略",
      "测试消息顺序和死锁：验证系统正确处理乱序投递、邮箱溢出和Actor生命周期"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Actor",
      "Message",
      "Mailbox",
      "Behavior",
      "Spawn"
    ],
    "viz_labels_zh": [
      "Actor",
      "消息",
      "邮箱",
      "行为",
      "派生"
    ],
    "related": [
      "eda",
      "reactive-extensions",
      "agent-communication-protocol"
    ],
    "tags": [
      "concurrency",
      "message-passing",
      "actors",
      "fault-tolerance",
      "distributed"
    ],
    "origin_author": "Carl Hewitt, Peter Bishop, Richard Steiger, 1973",
    "origin_source": "A Universal Modular ACTOR Formalism for Artificial Intelligence (IJCAI 1973)",
    "origin_source_zh": "《一种面向人工智能的通用模块化ACTOR形式化方法》（1973年IJCAI会议）",
    "complexity": "intermediate",
    "when_to_use": [
      "When building highly concurrent systems that need to manage millions of lightweight concurrent entities",
      "When designing fault-tolerant systems that must self-heal from component failures",
      "When shared-memory concurrency (locks, mutexes) becomes too error-prone or bottlenecked",
      "When building distributed systems that need location-transparent communication between components"
    ],
    "when_to_use_zh": [
      "构建需要管理数百万轻量级并发实体的高并发系统时",
      "设计必须从组件故障中自我修复的容错系统时",
      "共享内存并发（锁、互斥量）变得过于容易出错或成为瓶颈时",
      "构建需要组件间位置透明通信的分布式系统时"
    ],
    "core_concepts": [
      "Actor: A fundamental unit of computation that encapsulates state, behavior, and a mailbox for receiving messages",
      "Message passing: Actors communicate exclusively through asynchronous, immutable messages with no shared state",
      "Supervision tree: Hierarchical fault handling where parent actors define recovery strategies for child actor failures",
      "Location transparency: Actors communicate through addresses that abstract away whether the target is local or remote",
      "Let it crash: Rather than defensive error handling everywhere, let actors fail and rely on supervisors to restart them"
    ],
    "core_concepts_zh": [
      "Actor：封装状态、行为和消息邮箱的基本计算单元",
      "消息传递：Actor仅通过异步、不可变的消息通信，不共享状态",
      "监督树：层级化的故障处理，父Actor为子Actor失败定义恢复策略",
      "位置透明：Actor通过抽象地址通信，屏蔽目标是本地还是远程",
      "任其崩溃：不在各处进行防御性错误处理，而是让Actor失败并依赖监督者重启"
    ],
    "timeline": [
      [
        "1973",
        "Carl Hewitt et al. publish the foundational Actor Model paper at IJCAI"
      ],
      [
        "1986",
        "Joe Armstrong creates Erlang at Ericsson, implementing the Actor Model for telecom systems"
      ],
      [
        "2006",
        "Philipp Haller introduces Scala Actors, later evolving into the Akka framework"
      ],
      [
        "2009",
        "Lightbend (Typesafe) releases Akka, bringing the Actor Model to the JVM ecosystem"
      ],
      [
        "2014",
        "Microsoft Research releases Orleans, a virtual actor framework for .NET cloud services"
      ]
    ],
    "timeline_zh": [
      [
        "1973",
        "Carl Hewitt等人在IJCAI发表奠基性的Actor模型论文"
      ],
      [
        "1986",
        "Joe Armstrong在爱立信创建Erlang，为电信系统实现Actor模型"
      ],
      [
        "2006",
        "Philipp Haller引入Scala Actors，后演变为Akka框架"
      ],
      [
        "2009",
        "Lightbend（Typesafe）发布Akka，将Actor模型带入JVM生态"
      ],
      [
        "2014",
        "微软研究院发布Orleans，面向.NET云服务的虚拟Actor框架"
      ]
    ],
    "dos": [
      "Do keep actor state private and communicate only through messages because shared state defeats the concurrency model",
      "Do design fine-grained actors with single responsibilities because coarse-grained actors become concurrency bottlenecks",
      "Do use supervision hierarchies to handle failures because actors will crash and need systematic recovery",
      "Do make messages immutable and serializable because they may cross thread, process, or network boundaries"
    ],
    "dos_zh": [
      "保持Actor状态私有，仅通过消息通信，因为共享状态会破坏并发模型",
      "设计职责单一的细粒度Actor，因为粗粒度Actor会成为并发瓶颈",
      "使用监督层级处理故障，因为Actor会崩溃且需要系统化恢复",
      "使消息不可变且可序列化，因为消息可能跨线程、进程或网络边界传递"
    ],
    "donts": [
      "Don't block inside actors with synchronous I/O because it defeats the asynchronous concurrency model",
      "Don't create too few, too large actors because it reintroduces the bottleneck problems actors solve",
      "Don't rely on message ordering between different actors because the model guarantees order only between a specific sender-receiver pair",
      "Don't expose actor internal state through shared references because it creates race conditions"
    ],
    "donts_zh": [
      "不要在Actor内部用同步I/O阻塞，因为这会破坏异步并发模型",
      "不要创建过少或过大的Actor，因为这会重新引入Actor模型要解决的瓶颈问题",
      "不要依赖不同Actor之间的消息顺序，因为模型仅保证特定发送者-接收者对之间的顺序",
      "不要通过共享引用暴露Actor内部状态，因为这会产生竞态条件"
    ],
    "case_study_company": "WhatsApp (Erlang/OTP)",
    "case_study": "WhatsApp used Erlang/OTP's actor model to build its messaging backend, enabling a team of roughly 50 engineers to support 900 million users by 2015, about a year after the Facebook acquisition. Each user connection was represented as a lightweight Erlang process (actor) consuming only about 2KB of memory. The supervision tree architecture meant that individual connection failures were isolated and automatically restarted without affecting other users, achieving 99.99% uptime.",
    "case_study_zh": "WhatsApp使用Erlang/OTP的Actor模型构建消息后端，使仅约50名工程师的团队在2015年（被Facebook收购约一年后）即能支持9亿用户。每个用户连接被表示为仅消耗约2KB内存的轻量级Erlang进程（Actor）。监督树架构意味着单个连接故障被隔离并自动重启，不影响其他用户，实现了99.99%的正常运行时间。",
    "when_not_to_use": [
      "Simple sequential programs that don't require concurrency or distribution",
      "CPU-bound computations that benefit more from parallel data processing than message-passing",
      "Systems with very strict latency requirements where message routing overhead is unacceptable"
    ],
    "when_not_to_use_zh": [
      "不需要并发或分布的简单顺序程序",
      "从并行数据处理中获益更多而非消息传递的CPU密集型计算",
      "消息路由开销不可接受的极严格延迟要求系统"
    ],
    "adopters": [
      "Ericsson",
      "WhatsApp",
      "Discord",
      "Microsoft (Orleans)",
      "Lightbend (Akka)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "scalability",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Carl Hewitt, Peter Bishop, and Richard Steiger (1973). \"A Universal Modular ACTOR Formalism for Artificial Intelligence\". IJCAI 1973.",
    "secondary_sources": [
      "Gul Agha (1986). \"Actors: A Model of Concurrent Computation in Distributed Systems\". MIT Press.",
      "Vaughn Vernon (2016). \"Reactive Messaging Patterns with the Actor Model\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "eda",
        "type": "complement"
      },
      {
        "slug": "reactive-extensions",
        "type": "related"
      },
      {
        "slug": "agent-communication-protocol",
        "type": "related"
      }
    ]
  },
  {
    "id": 26,
    "name": "Dependency Injection",
    "name_zh": "依赖注入",
    "slug": "dependency-injection",
    "category": "architecture",
    "desc": "Invert control flow by injecting dependencies externally",
    "desc_zh": "通过外部注入依赖来反转控制流，降低模块耦合度",
    "steps": [
      "Identify tight couplings: find classes or modules that directly instantiate their dependencies, making them hard to test and swap",
      "Extract interfaces: define abstractions (interfaces or protocols) for each dependency so consumers depend on contracts, not implementations",
      "Configure a DI container or use constructor injection to wire dependencies at application startup or composition root",
      "Manage object lifetimes: decide scoping for each dependency — singleton, scoped (per-request), or transient (per-use)",
      "Validate the dependency graph: use the container's diagnostic tools or tests to detect circular dependencies, missing registrations, and captive dependencies"
    ],
    "steps_zh": [
      "识别紧耦合：找到直接实例化其依赖的类或模块，这使得它们难以测试和替换",
      "提取接口：为每个依赖定义抽象（接口或协议），使消费方依赖契约而非实现",
      "配置DI容器或使用构造函数注入，在应用启动时或组合根处装配依赖",
      "管理对象生命周期：为每个依赖决定作用域——单例、作用域（每请求）或瞬态（每次使用）",
      "验证依赖图：使用容器的诊断工具或测试检测循环依赖、缺失注册和捕获依赖"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Client",
      "Interface",
      "Injector",
      "Service"
    ],
    "viz_labels_zh": [
      "客户端",
      "抽象接口",
      "注入器",
      "服务实现"
    ],
    "related": [
      "solid-principles",
      "hexagonal-architecture",
      "clean-code-principles"
    ],
    "tags": [
      "dependency-injection",
      "inversion-of-control",
      "decoupling",
      "testability"
    ],
    "origin_author": "Martin Fowler, 2004",
    "origin_source": "Inversion of Control Containers and the Dependency Injection pattern (martinfowler.com, 2004)",
    "origin_source_zh": "《控制反转容器与依赖注入模式》（martinfowler.com，2004年）",
    "complexity": "beginner",
    "when_to_use": [
      "When classes have hard-coded dependencies that make unit testing with mocks impossible",
      "When you need to swap implementations at runtime or between environments (e.g., test vs. production)",
      "When building modular applications where components should be independently replaceable",
      "When applying SOLID principles, particularly the Dependency Inversion Principle"
    ],
    "when_to_use_zh": [
      "类存在硬编码依赖导致无法用Mock进行单元测试时",
      "需要在运行时或不同环境间（如测试与生产）切换实现时",
      "构建组件应可独立替换的模块化应用时",
      "应用SOLID原则，特别是依赖倒置原则时"
    ],
    "core_concepts": [
      "Constructor injection: Dependencies are provided through the constructor, making them explicit and required",
      "Composition root: The single place in the application where the entire dependency graph is assembled",
      "Interface segregation: Consumers depend on narrow abstractions rather than concrete classes",
      "Lifetime management: Controlling whether dependencies are singletons, scoped, or created fresh each time",
      "Service locator anti-pattern: Using a global registry to look up dependencies, which hides them from the API surface"
    ],
    "core_concepts_zh": [
      "构造函数注入：通过构造函数提供依赖，使依赖关系显式且必需",
      "组合根：应用中组装整个依赖图的唯一位置",
      "接口隔离：消费方依赖窄接口抽象而非具体类",
      "生命周期管理：控制依赖是单例、作用域还是每次创建新实例",
      "服务定位器反模式：使用全局注册表查找依赖，将依赖隐藏于API表面之下"
    ],
    "timeline": [
      [
        "1994",
        "Gang of Four 'Design Patterns' book introduces Strategy and Factory patterns as precursors"
      ],
      [
        "1998",
        "Apache Avalon framework introduces early IoC container concepts in Java"
      ],
      [
        "2003",
        "Spring Framework first released as open source (1.0 follows in March 2004), popularizing DI in the Java ecosystem"
      ],
      [
        "2004",
        "Martin Fowler coins the term 'Dependency Injection' in his influential article"
      ],
      [
        "2016",
        "ASP.NET Core ships with a built-in DI container, making DI a first-class framework feature"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "四人帮《设计模式》引入策略模式和工厂模式作为前身概念"
      ],
      [
        "1998",
        "Apache Avalon框架在Java中引入早期IoC容器概念"
      ],
      [
        "2003",
        "Spring Framework首次开源发布（1.0版于2004年3月发布），在Java生态中推广DI"
      ],
      [
        "2004",
        "Martin Fowler在其影响深远的文章中创造「依赖注入」一词"
      ],
      [
        "2016",
        "ASP.NET Core内置DI容器发布，使DI成为框架一等特性"
      ]
    ],
    "dos": [
      "Do prefer constructor injection because it makes dependencies explicit, required, and visible in the API",
      "Do configure the DI container at a single composition root because scattered configuration creates confusion",
      "Do register dependencies by interface rather than concrete type because it preserves the abstraction benefit",
      "Do validate the container at startup because failing fast on missing registrations prevents runtime surprises"
    ],
    "dos_zh": [
      "优先使用构造函数注入，因为它使依赖显式、必需且在API中可见",
      "在单一组合根配置DI容器，因为分散的配置会造成混乱",
      "按接口而非具体类型注册依赖，因为这保留了抽象的好处",
      "在启动时验证容器，因为对缺失注册快速失败可防止运行时意外"
    ],
    "donts": [
      "Don't inject the DI container itself into classes because it creates a Service Locator anti-pattern",
      "Don't use property injection for required dependencies because they can be left null, causing runtime failures",
      "Don't create deep dependency chains because they make the system hard to understand and debug",
      "Don't register everything as singleton by default because incorrect lifetime scoping causes subtle concurrency bugs"
    ],
    "donts_zh": [
      "不要将DI容器本身注入到类中，因为这会产生服务定位器反模式",
      "不要对必需的依赖使用属性注入，因为它们可能为空导致运行时失败",
      "不要创建过深的依赖链，因为这使系统难以理解和调试",
      "不要默认将所有依赖注册为单例，因为错误的生命周期范围会导致隐蔽的并发缺陷"
    ],
    "case_study_company": "Google (Angular)",
    "case_study": "Google designed Angular's dependency injection system as a core framework feature, making it the first major frontend framework with built-in hierarchical DI. Angular's injector tree mirrors the component tree, allowing services to be scoped at module, component, or application level. This design enabled teams at Google to build large-scale web applications where components could be independently tested with mock services and swapped without modifying consuming code.",
    "case_study_zh": "Google将Angular的依赖注入系统设计为核心框架特性，使其成为首个内置层级化DI的主流前端框架。Angular的注入器树镜像组件树，允许服务在模块、组件或应用级别限定作用域。这种设计使Google的团队能够构建大规模Web应用，组件可以用Mock服务独立测试，并在不修改消费方代码的情况下替换。",
    "when_not_to_use": [
      "Simple scripts or small programs where the overhead of a DI framework exceeds the coupling problem it solves",
      "Performance-critical hot paths where the indirection of DI containers adds measurable latency",
      "Functional programming codebases where higher-order functions and closures naturally invert dependencies"
    ],
    "when_not_to_use_zh": [
      "DI框架的开销超过它所解决的耦合问题的简单脚本或小程序",
      "DI容器的间接性增加可测量延迟的性能关键热路径",
      "高阶函数和闭包自然反转依赖的函数式编程代码库"
    ],
    "adopters": [
      "Google (Angular)",
      "Pivotal (Spring)",
      "Microsoft (.NET)",
      "JetBrains (IntelliJ)",
      "NestJS"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "testability",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Martin Fowler (2004). \"Inversion of Control Containers and the Dependency Injection Pattern\". martinfowler.com.",
    "secondary_sources": [
      "Mark Seemann (2011). \"Dependency Injection in .NET\". Manning Publications.",
      "Robert C. Martin (2017). \"Clean Architecture: A Craftsman's Guide to Software Structure and Design\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "hexagonal-architecture",
        "type": "related"
      },
      {
        "slug": "clean-code-principles",
        "type": "related"
      }
    ]
  },
  {
    "id": 27,
    "name": "Service Mesh Pattern",
    "name_zh": "服务网格模式",
    "slug": "service-mesh-pattern",
    "category": "architecture",
    "desc": "Dedicated infrastructure for service-to-service communication",
    "desc_zh": "为服务间通信提供专用基础设施层的架构模式",
    "steps": [
      "Assess readiness: evaluate whether your microservices count, operational complexity, and team maturity justify a service mesh investment",
      "Select a mesh implementation: compare options (Istio, Linkerd, Consul Connect) based on feature set, resource overhead, and ecosystem fit",
      "Deploy sidecar proxies: inject a proxy (e.g., Envoy) alongside each service instance to intercept all inbound and outbound traffic transparently",
      "Configure traffic policies: set up mutual TLS, retry budgets, circuit breaking, rate limiting, and traffic splitting rules via the mesh control plane",
      "Establish observability: leverage the mesh's built-in distributed tracing, metrics collection, and access logging for end-to-end service visibility"
    ],
    "steps_zh": [
      "评估就绪度：评估微服务数量、运维复杂度和团队成熟度是否足以证明服务网格的投入",
      "选择网格实现：根据功能集、资源开销和生态契合度比较选项（Istio、Linkerd、Consul Connect）",
      "部署边车代理：在每个服务实例旁注入代理（如Envoy），透明拦截所有入站和出站流量",
      "配置流量策略：通过网格控制面设置双向TLS、重试预算、熔断、限流和流量分割规则",
      "建立可观测性：利用网格内置的分布式追踪、指标采集和访问日志实现端到端服务可见性"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Service",
      "Sidecar Proxy",
      "Control Plane",
      "Data Plane"
    ],
    "viz_labels_zh": [
      "服务实例",
      "Sidecar代理",
      "控制平面",
      "数据平面"
    ],
    "related": [
      "microservices-decomposition",
      "circuit-breaker-pattern",
      "infrastructure-as-code"
    ],
    "tags": [
      "service-mesh",
      "sidecar",
      "traffic-management",
      "observability",
      "mTLS"
    ],
    "origin_author": "William Morgan (Buoyant), 2017",
    "origin_source": "What's a service mesh? And why do I need one? (buoyant.io blog, 2017); concept evolved from Linkerd and Envoy projects",
    "origin_source_zh": "《什么是服务网格？为什么我需要它？》（buoyant.io博客，2017年）；概念从Linkerd和Envoy项目演化而来",
    "complexity": "advanced",
    "when_to_use": [
      "When managing dozens or hundreds of microservices and cross-cutting concerns like mTLS, retries, and observability become repetitive in application code",
      "When zero-trust networking requires mutual TLS between all services without application code changes",
      "When canary deployments and traffic shifting need fine-grained, infrastructure-level control",
      "When distributed tracing and metrics need to be collected uniformly across all services"
    ],
    "when_to_use_zh": [
      "管理数十或数百个微服务且mTLS、重试、可观测性等横切关注点在应用代码中重复出现时",
      "零信任网络要求所有服务间双向TLS而不修改应用代码时",
      "金丝雀部署和流量切换需要细粒度基础设施级别控制时",
      "需要在所有服务中统一收集分布式追踪和指标时"
    ],
    "core_concepts": [
      "Sidecar proxy: A proxy (typically Envoy) deployed alongside each service to handle all network traffic transparently",
      "Control plane: Centralized management (Istiod, linkerd-control-plane) that configures all sidecar proxies",
      "Data plane: The collection of sidecar proxies that actually handle request routing and policy enforcement",
      "Mutual TLS (mTLS): Automatic encryption and identity verification between services without application changes",
      "Traffic management: Capabilities like canary releases, circuit breaking, retries, and rate limiting managed at infrastructure level"
    ],
    "core_concepts_zh": [
      "边车代理：部署在每个服务旁的代理（通常为Envoy），透明处理所有网络流量",
      "控制面：集中管理（Istiod、linkerd-control-plane），配置所有边车代理",
      "数据面：实际处理请求路由和策略执行的边车代理集合",
      "双向TLS（mTLS）：无需修改应用即可在服务间自动加密和身份验证",
      "流量管理：金丝雀发布、熔断、重试和限流等在基础设施层管理的能力"
    ],
    "timeline": [
      [
        "2016",
        "Linkerd first released by Buoyant as the first dedicated service mesh (1.0 follows in April 2017)"
      ],
      [
        "2016",
        "Lyft open-sources Envoy proxy, which becomes the standard data plane"
      ],
      [
        "2017",
        "Google, IBM, and Lyft launch Istio, the most widely adopted service mesh"
      ],
      [
        "2019",
        "Service Mesh Interface (SMI) specification proposed for mesh interoperability"
      ],
      [
        "2023",
        "Istio ambient mesh introduces sidecar-less mode, reducing resource overhead"
      ]
    ],
    "timeline_zh": [
      [
        "2016",
        "Buoyant首次发布Linkerd，作为首个专用服务网格（1.0版于2017年4月发布）"
      ],
      [
        "2016",
        "Lyft开源Envoy代理，成为标准数据面"
      ],
      [
        "2017",
        "Google、IBM和Lyft推出Istio，成为最广泛采用的服务网格"
      ],
      [
        "2019",
        "服务网格接口（SMI）规范被提出以实现网格互操作性"
      ],
      [
        "2023",
        "Istio ambient mesh引入无边车模式，降低资源开销"
      ]
    ],
    "dos": [
      "Do start with observability features first because they provide immediate value with low risk",
      "Do adopt incrementally by enabling the mesh for a subset of services first because big-bang rollouts are risky",
      "Do monitor the resource overhead of sidecar proxies because they consume CPU and memory alongside each service",
      "Do use the mesh for mTLS everywhere because it eliminates an entire class of network security vulnerabilities"
    ],
    "dos_zh": [
      "优先从可观测性功能开始，因为它们以低风险提供即时价值",
      "增量采用，先为部分服务启用网格，因为大爆炸式推出风险很高",
      "监控边车代理的资源开销，因为它们在每个服务旁消耗CPU和内存",
      "全面使用网格实现mTLS，因为它消除了一整类网络安全漏洞"
    ],
    "donts": [
      "Don't adopt a service mesh for fewer than 10 services because the operational overhead outweighs the benefits",
      "Don't ignore the latency added by sidecar proxies because each hop adds 1-3ms that compounds across call chains",
      "Don't configure traffic policies without load testing because misconfigured retry budgets can cause retry storms",
      "Don't assume the mesh replaces application-level resilience because it complements rather than substitutes business logic error handling"
    ],
    "donts_zh": [
      "不要为少于10个服务采用服务网格，因为运维开销超过收益",
      "不要忽视边车代理增加的延迟，因为每一跳增加1-3毫秒并在调用链中累积",
      "不要未经负载测试就配置流量策略，因为错误配置的重试预算可能导致重试风暴",
      "不要假设网格替代应用级弹性，它是业务逻辑错误处理的补充而非替代"
    ],
    "case_study_company": "Lyft",
    "case_study": "Lyft built Envoy proxy to solve the challenge of managing service-to-service communication across hundreds of microservices written in multiple languages. Before Envoy, each service team implemented its own retry logic, circuit breaking, and observability, leading to inconsistent behavior. By deploying Envoy as a sidecar proxy for every service, Lyft achieved uniform mTLS, distributed tracing, and traffic management. Envoy was later donated to the CNCF and became the data plane for Istio and other service meshes.",
    "case_study_zh": "Lyft构建Envoy代理来解决数百个以多种语言编写的微服务之间管理服务间通信的挑战。在Envoy之前，每个服务团队自行实现重试逻辑、熔断和可观测性，导致行为不一致。通过为每个服务部署Envoy作为边车代理，Lyft实现了统一的mTLS、分布式追踪和流量管理。Envoy后来捐赠给CNCF，成为Istio和其他服务网格的数据面。",
    "when_not_to_use": [
      "Small deployments with fewer than 10 services where a simple API gateway or load balancer suffices",
      "Teams without Kubernetes expertise because most service meshes assume container orchestration",
      "Latency-sensitive applications where the added sidecar proxy hop is unacceptable"
    ],
    "when_not_to_use_zh": [
      "少于10个服务、简单API网关或负载均衡器即可满足需求的小型部署",
      "缺乏Kubernetes经验的团队，因为大多数服务网格假设容器编排环境",
      "边车代理额外跳转不可接受的延迟敏感应用"
    ],
    "adopters": [
      "Lyft",
      "Airbnb",
      "eBay",
      "Salesforce",
      "T-Mobile"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "observability"
    ],
    "maturity_ring": "established",
    "primary_source": "William Morgan (2017). \"What's a Service Mesh? And Why Do I Need One?\". buoyant.io.",
    "secondary_sources": [
      "Lee Calcote and Zack Butcher (2020). \"Istio: Up and Running\". O'Reilly Media.",
      "Matt Klein (2017). \"The Universal Data Plane API\". Envoy Proxy Blog."
    ],
    "typed_relations": [
      {
        "slug": "microservices-decomposition",
        "type": "complement"
      },
      {
        "slug": "circuit-breaker-pattern",
        "type": "related"
      },
      {
        "slug": "infrastructure-as-code",
        "type": "related"
      }
    ]
  },
  {
    "id": 28,
    "name": "LLM System Design Patterns",
    "name_zh": "LLM系统设计模式",
    "slug": "llm-system-design-patterns",
    "category": "architecture",
    "desc": "Architectural patterns for production LLM-powered applications",
    "desc_zh": "面向生产环境的LLM驱动应用架构模式集合",
    "steps": [
      "Select the primary integration pattern: RAG (retrieval-augmented), fine-tuned model, prompt chaining, or tool-use agent",
      "Design the prompt management layer: prompt templates, versioning, A/B testing infrastructure, and injection of context",
      "Choose memory and state architecture: in-context window, external vector store, structured DB, or episodic memory",
      "Define latency, cost, and quality trade-offs; implement routing to smaller/faster models for simpler sub-tasks",
      "Build evaluation infrastructure: automated evals, hallucination detection, guardrails, and human feedback loops for continuous improvement"
    ],
    "steps_zh": [
      "选择主要集成模式：RAG（检索增强生成）、微调模型、提示链或工具调用智能体",
      "设计提示管理层：提示模板、版本控制、A/B测试基础设施以及上下文注入",
      "选择记忆与状态架构：上下文窗口内、外部向量存储、结构化数据库或情节记忆",
      "定义延迟、成本和质量权衡；实现路由机制将简单子任务转发给更小更快的模型",
      "构建评估基础设施：自动化评测、幻觉检测、安全护栏和人类反馈循环以持续改进"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "RAG",
      "Agent Loop",
      "Tool Use",
      "Guardrails"
    ],
    "viz_labels_zh": [
      "检索增强",
      "智能体循环",
      "工具调用",
      "安全护栏"
    ],
    "related": [
      "rag-architecture",
      "react-framework",
      "prompt-chaining",
      "guardrails-framework"
    ],
    "tags": [
      "llm",
      "ai-architecture",
      "rag",
      "prompt-engineering",
      "production"
    ],
    "origin_author": "Community-evolved; key contributors include Harrison Chase (LangChain, 2022) and Jerry Liu (LlamaIndex, 2022)",
    "origin_source": "Patterns of LLM Application Design (various: LangChain docs, Chip Huyen's blog, Anthropic's guides)",
    "origin_source_zh": "《LLM应用设计模式》（多来源：LangChain文档、Chip Huyen博客、Anthropic指南）",
    "complexity": "advanced",
    "when_to_use": [
      "When building production applications that integrate LLMs for generation, summarization, or reasoning tasks",
      "When you need to ground LLM outputs in domain-specific knowledge using retrieval-augmented generation",
      "When designing multi-step agent workflows that require tool use, planning, and iterative refinement",
      "When cost, latency, and quality trade-offs require intelligent routing between different model sizes"
    ],
    "when_to_use_zh": [
      "构建集成LLM用于生成、摘要或推理任务的生产应用时",
      "需要使用检索增强生成将LLM输出锚定在领域特定知识中时",
      "设计需要工具使用、规划和迭代改进的多步骤智能体工作流时",
      "成本、延迟和质量权衡需要在不同模型大小之间智能路由时"
    ],
    "core_concepts": [
      "RAG (Retrieval-Augmented Generation): Enhancing LLM responses by retrieving relevant documents from a vector store before generation",
      "Prompt chaining: Breaking complex tasks into sequential LLM calls where each step's output feeds the next",
      "Tool use / function calling: Enabling LLMs to invoke external APIs, databases, or code execution environments",
      "Model routing: Directing requests to different model sizes based on task complexity to optimize cost and latency",
      "Guardrails: Input/output validation layers that prevent harmful, off-topic, or hallucinated responses"
    ],
    "core_concepts_zh": [
      "RAG（检索增强生成）：在生成前从向量存储中检索相关文档以增强LLM响应",
      "提示链：将复杂任务分解为顺序LLM调用，每步输出作为下一步输入",
      "工具使用/函数调用：使LLM能够调用外部API、数据库或代码执行环境",
      "模型路由：根据任务复杂度将请求导向不同大小的模型以优化成本和延迟",
      "安全护栏：防止有害、离题或幻觉响应的输入/输出验证层"
    ],
    "timeline": [
      [
        "2020",
        "OpenAI releases GPT-3 API, sparking the first wave of LLM application development"
      ],
      [
        "2022",
        "LangChain and LlamaIndex frameworks launch, establishing common LLM integration patterns"
      ],
      [
        "2023",
        "RAG pattern becomes the dominant approach for grounding LLM outputs in enterprise data"
      ],
      [
        "2024",
        "Agentic architectures emerge with tool use, planning loops, and multi-agent coordination"
      ],
      [
        "2025",
        "LLM system design matures with standardized evaluation frameworks, cost optimization, and production observability"
      ]
    ],
    "timeline_zh": [
      [
        "2020",
        "OpenAI发布GPT-3 API，引发第一波LLM应用开发浪潮"
      ],
      [
        "2022",
        "LangChain和LlamaIndex框架发布，建立通用LLM集成模式"
      ],
      [
        "2023",
        "RAG模式成为将LLM输出锚定在企业数据中的主导方法"
      ],
      [
        "2024",
        "智能体架构兴起，支持工具使用、规划循环和多智能体协调"
      ],
      [
        "2025",
        "LLM系统设计随标准化评估框架、成本优化和生产可观测性趋于成熟"
      ]
    ],
    "dos": [
      "Do implement structured evaluation pipelines because 'it looks good' is not a reliable quality measure for LLM outputs",
      "Do version your prompts and treat them as code artifacts because prompt changes directly affect output quality",
      "Do use RAG before fine-tuning because it is cheaper, faster to iterate, and keeps the knowledge updatable",
      "Do add guardrails for input validation and output safety because LLMs can be manipulated through prompt injection"
    ],
    "dos_zh": [
      "实现结构化评估管道，因为「看起来不错」不是LLM输出的可靠质量度量",
      "对提示进行版本控制并将其视为代码产物，因为提示变更直接影响输出质量",
      "在微调之前先使用RAG，因为它更便宜、迭代更快且知识可更新",
      "添加输入验证和输出安全的护栏，因为LLM可能被提示注入攻击操纵"
    ],
    "donts": [
      "Don't send raw user input directly to the LLM because it opens the door to prompt injection attacks",
      "Don't rely solely on temperature settings for output diversity because structured sampling strategies give more control",
      "Don't build monolithic prompt chains because a failure in one step cascades and is hard to debug",
      "Don't skip cost monitoring because LLM API costs can grow exponentially with scale and retries"
    ],
    "donts_zh": [
      "不要将原始用户输入直接发送给LLM，因为这为提示注入攻击打开了大门",
      "不要仅依赖温度设置来控制输出多样性，因为结构化采样策略提供更多控制",
      "不要构建单体提示链，因为一步失败会级联影响且难以调试",
      "不要跳过成本监控，因为LLM API费用可能随规模和重试呈指数增长"
    ],
    "case_study_company": "Klarna",
    "case_study": "Klarna deployed an AI assistant powered by OpenAI that handled two-thirds of all customer service chats within its first month, performing the equivalent work of 700 full-time agents. The system uses RAG to retrieve relevant order data and company policies, prompt chaining for multi-turn conversation management, and guardrails to prevent the AI from making unauthorized commitments. Klarna reported a 25% reduction in repeat inquiries and average resolution time dropping from 11 minutes to under 2 minutes.",
    "case_study_zh": "Klarna部署了由OpenAI驱动的AI助手，在上线首月即处理了三分之二的客服聊天，相当于700名全职客服的工作量。该系统使用RAG检索相关订单数据和公司政策，使用提示链管理多轮对话，并使用护栏防止AI做出未授权的承诺。Klarna报告重复咨询减少25%，平均解决时间从11分钟降至不到2分钟。",
    "when_not_to_use": [
      "Deterministic computation tasks where traditional algorithms provide exact, reproducible results",
      "Highly regulated domains where LLM non-determinism and hallucination risk are unacceptable without human oversight",
      "Low-latency paths where LLM inference time (hundreds of milliseconds to seconds) exceeds requirements"
    ],
    "when_not_to_use_zh": [
      "传统算法能提供精确可重现结果的确定性计算任务",
      "LLM的不确定性和幻觉风险在没有人工监督的情况下不可接受的高度监管领域",
      "LLM推理时间（数百毫秒到数秒）超出要求的低延迟路径"
    ],
    "adopters": [
      "Klarna",
      "Notion",
      "Shopify",
      "Stripe",
      "Replit"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Harrison Chase (2022). \"LangChain: Building Applications with LLMs through Composability\". github.com/langchain-ai.",
    "secondary_sources": [
      "Chip Huyen (2023). \"Building LLM Applications for Production\". huyenchip.com.",
      "Anthropic (2024). \"Building Effective Agents\". anthropic.com."
    ],
    "typed_relations": [
      {
        "slug": "rag-architecture",
        "type": "complement"
      },
      {
        "slug": "react-framework",
        "type": "related"
      },
      {
        "slug": "prompt-chaining",
        "type": "related"
      },
      {
        "slug": "guardrails-framework",
        "type": "related"
      }
    ]
  },
  {
    "id": 167,
    "name": "Layered Architecture",
    "name_zh": "分层架构",
    "slug": "layered-architecture",
    "category": "architecture",
    "desc": "Traditional n-tier architecture separating presentation, business logic, and data access into horizontal layers with strict dependency rules",
    "desc_zh": "将表现层、业务逻辑层和数据访问层分离为水平层次，并设定严格依赖规则的传统N层架构",
    "steps": [
      "Identify the logical layers required for your system (typically presentation, business logic, persistence, and database)",
      "Define strict dependency rules: each layer may only depend on the layer directly below it, never upward or across",
      "Design interfaces between layers so that each layer exposes a well-defined API to the layer above",
      "Implement each layer as an independent module with its own internal structure and responsibilities",
      "Validate layer isolation by ensuring no circular dependencies exist and that changes in one layer do not ripple through others"
    ],
    "steps_zh": [
      "识别系统所需的逻辑层（通常为表现层、业务逻辑层、持久化层和数据库层）",
      "定义严格的依赖规则：每一层只能依赖其直接下层，不可向上或跨层依赖",
      "设计层间接口，使每一层向上层暴露明确定义的API",
      "将每一层实现为独立模块，拥有各自的内部结构和职责",
      "验证层隔离性，确保不存在循环依赖，且一层的变更不会波及其他层"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Presentation",
      "Business Logic",
      "Data Access",
      "Database"
    ],
    "viz_labels_zh": [
      "展示层",
      "业务逻辑层",
      "数据访问层",
      "数据库层"
    ],
    "related": [
      "modular-monolith",
      "c4-model",
      "hexagonal-architecture"
    ],
    "tags": [
      "n-tier",
      "separation-of-concerns",
      "layering",
      "monolith",
      "dependency-rules"
    ],
    "origin_author": "Frank Buschmann, Regine Meunier, Hans Rohnert, Peter Sommerlad, Michael Stal, 1996",
    "origin_source": "Pattern-Oriented Software Architecture, Volume 1: A System of Patterns (Wiley, 1996)",
    "origin_source_zh": "《面向模式的软件架构·卷1：模式系统》（Wiley，1996年）",
    "complexity": "beginner",
    "when_to_use": [
      "When building traditional enterprise applications with clear separation between UI, business rules, and data storage",
      "When the team is relatively small and needs a straightforward, well-understood architecture pattern",
      "When the application has moderate complexity and does not require independent deployment of components",
      "When regulatory or organizational standards mandate a clear separation of concerns between tiers"
    ],
    "when_to_use_zh": [
      "当构建具有UI、业务规则和数据存储之间清晰分离的传统企业应用时",
      "当团队规模较小，需要一种直观且被广泛理解的架构模式时",
      "当应用具有中等复杂度，不需要组件独立部署时",
      "当法规或组织标准要求各层之间具有明确的关注点分离时"
    ],
    "core_concepts": [
      "Layer isolation: Each layer encapsulates a specific concern and communicates only through well-defined interfaces",
      "Top-down dependency: Dependencies flow strictly downward from presentation to data, never upward",
      "Closed layers: Requests must pass through each layer sequentially without skipping, enforcing separation",
      "Open layers: Optional variant where certain layers can be bypassed for performance-critical paths",
      "Layer cohesion: Each layer groups related functionality together, making the system easier to understand and maintain"
    ],
    "core_concepts_zh": [
      "层隔离：每一层封装特定关注点，仅通过明确定义的接口进行通信",
      "自顶向下依赖：依赖关系严格从表现层向数据层流动，不可向上",
      "封闭层：请求必须依次通过每一层，不可跳跃，以强制分离",
      "开放层：可选变体，允许某些层在性能关键路径中被绕过",
      "层内聚：每一层将相关功能组合在一起，使系统更易于理解和维护"
    ],
    "timeline": [
      [
        "1968",
        "Dijkstra introduces layered design principles in the THE multiprogramming system"
      ],
      [
        "1996",
        "Buschmann et al. formalize the Layers pattern in POSA Volume 1"
      ],
      [
        "2003",
        "Java EE popularizes the three-tier architecture (JSP/Servlet, EJB, JDBC) in enterprise development"
      ],
      [
        "2014",
        "Richards describes layered architecture as the de facto standard in Software Architecture Patterns"
      ],
      [
        "2020",
        "Richards and Ford analyze layered architecture trade-offs in Fundamentals of Software Architecture"
      ]
    ],
    "timeline_zh": [
      [
        "1968",
        "Dijkstra在THE多道程序系统中引入分层设计原则"
      ],
      [
        "1996",
        "Buschmann等人在POSA卷1中正式定义「层」模式"
      ],
      [
        "2003",
        "Java EE以三层架构（JSP/Servlet、EJB、JDBC）在企业开发中普及"
      ],
      [
        "2014",
        "Richards在《软件架构模式》中将分层架构描述为事实标准"
      ],
      [
        "2020",
        "Richards和Ford在《软件架构基础》中分析分层架构的权衡取舍"
      ]
    ],
    "dos": [
      "Do enforce strict layer boundaries with compiler or build-tool checks because accidental cross-layer dependencies erode the architecture over time",
      "Do use dependency inversion at layer boundaries because it allows layers to be tested and replaced independently",
      "Do keep layers thin and focused because bloated layers become monolithic sub-systems that are hard to change",
      "Do document which layers are open versus closed because ambiguity leads to inconsistent layer bypass decisions"
    ],
    "dos_zh": [
      "使用编译器或构建工具检查强制严格的层边界，因为意外的跨层依赖会随时间侵蚀架构",
      "在层边界使用依赖反转，因为这允许各层独立测试和替换",
      "保持每层精简聚焦，因为臃肿的层会变成难以变更的单体子系统",
      "记录哪些层是开放的、哪些是封闭的，因为模糊性会导致不一致的层绕过决策"
    ],
    "donts": [
      "Don't create a sinkhole anti-pattern where requests pass through layers without any processing because it indicates unnecessary layering",
      "Don't allow the presentation layer to access the database directly because it defeats the purpose of separation",
      "Don't put business logic in the presentation or persistence layers because it creates tight coupling and duplication",
      "Don't add layers just for symmetry because unnecessary layers add latency and complexity without value"
    ],
    "donts_zh": [
      "不要形成「污水池」反模式（请求穿过各层却不做任何处理），因为这表明存在不必要的分层",
      "不要让表现层直接访问数据库，因为这违背了分离的初衷",
      "不要将业务逻辑放在表现层或持久化层中，因为这会造成紧耦合和重复",
      "不要为了对称而添加层，因为不必要的层只会增加延迟和复杂度，却不带来任何价值"
    ],
    "case_study_company": "SAP",
    "case_study": "SAP's ERP system has historically been built on a layered architecture with a clear separation between the UI layer (SAP GUI/Fiori), the application server (ABAP business logic), and the database layer (initially Oracle/DB2, later HANA). This layered approach allowed SAP to migrate its database layer from traditional RDBMS to the in-memory HANA platform without rewriting the business logic or UI layers. The migration preserved decades of business rules while delivering 10-100x performance improvements in analytical queries.",
    "case_study_zh": "SAP的ERP系统历来基于分层架构构建，UI层（SAP GUI/Fiori）、应用服务器（ABAP业务逻辑）和数据库层（最初为Oracle/DB2，后为HANA）之间清晰分离。这种分层方法使SAP能够将数据库层从传统RDBMS迁移到内存数据库HANA平台，而无需重写业务逻辑或UI层。此次迁移保留了数十年的业务规则，同时在分析查询中实现了10到100倍的性能提升。",
    "when_not_to_use": [
      "Highly distributed systems requiring independent scaling of individual components",
      "Real-time event-driven applications where synchronous layer traversal introduces unacceptable latency",
      "Microservices environments where teams need autonomous deployment and technology heterogeneity"
    ],
    "when_not_to_use_zh": [
      "需要对各组件独立扩展的高度分布式系统",
      "同步层遍历会引入不可接受延迟的实时事件驱动应用",
      "团队需要自主部署和技术异构性的微服务环境"
    ],
    "adopters": [
      "SAP",
      "Oracle",
      "IBM",
      "Microsoft",
      "Salesforce"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Frank Buschmann et al. (1996). \"Pattern-Oriented Software Architecture, Volume 1: A System of Patterns\". Wiley.",
    "secondary_sources": [
      "Mark Richards and Neal Ford (2020). \"Fundamentals of Software Architecture\". O'Reilly Media.",
      "Martin Fowler (2002). \"Patterns of Enterprise Application Architecture\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "modular-monolith",
        "type": "complement"
      },
      {
        "slug": "c4-model",
        "type": "related"
      },
      {
        "slug": "hexagonal-architecture",
        "type": "related"
      }
    ]
  },
  {
    "id": 168,
    "name": "Modular Monolith",
    "name_zh": "模块化单体",
    "slug": "modular-monolith",
    "category": "architecture",
    "desc": "A single deployable unit with strictly enforced module boundaries, combining monolith simplicity with modular maintainability",
    "desc_zh": "具有严格模块边界的单一可部署单元，结合了单体的简洁性和模块化的可维护性",
    "steps": [
      "Decompose the domain into bounded contexts and define each as an independent module with its own public API",
      "Enforce module boundaries using compiler-level access controls, build tool configurations, or architecture fitness functions",
      "Ensure each module owns its data: use separate schemas or table prefixes to prevent direct cross-module database access",
      "Define explicit inter-module communication through public interfaces, events, or a mediator rather than shared mutable state",
      "Continuously validate module boundaries through automated tests and dependency analysis tools to prevent erosion over time"
    ],
    "steps_zh": [
      "将领域分解为限界上下文，并将每个上下文定义为拥有独立公共API的独立模块",
      "使用编译器级别的访问控制、构建工具配置或架构适应度函数来强制模块边界",
      "确保每个模块拥有自己的数据：使用独立schema或表前缀防止跨模块直接访问数据库",
      "通过公共接口、事件或中介者定义明确的模块间通信，而非依赖共享可变状态",
      "通过自动化测试和依赖分析工具持续验证模块边界，防止随时间退化"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Module",
      "Boundary",
      "Public API",
      "Shared Kernel"
    ],
    "viz_labels_zh": [
      "模块",
      "边界",
      "公共API",
      "共享内核"
    ],
    "related": [
      "layered-architecture",
      "microservices-decomposition",
      "c4-model"
    ],
    "tags": [
      "modularity",
      "monolith",
      "bounded-context",
      "encapsulation",
      "decomposition"
    ],
    "origin_author": "Simon Brown, 2015; elaborated by Mark Richards and Neal Ford, 2020",
    "origin_source": "Fundamentals of Software Architecture (O'Reilly, Richards & Ford, 2020)",
    "origin_source_zh": "《软件架构基础》（O'Reilly，Richards和Ford，2020年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When a team wants modular design benefits without the operational complexity of distributed microservices",
      "When the application is in early stages and domain boundaries are still being discovered",
      "When deployment simplicity is valued but long-term maintainability through clear module separation is also critical",
      "When transitioning from a traditional monolith toward better structure as a stepping stone to potential future decomposition"
    ],
    "when_to_use_zh": [
      "当团队希望获得模块化设计的好处，又不想承担分布式微服务的运维复杂性时",
      "当应用处于早期阶段，领域边界仍在探索中时",
      "当重视部署简洁性，但同时需要通过清晰的模块分离确保长期可维护性时",
      "当从传统单体向更好的结构过渡，并将其作为未来潜在拆分的垫脚石时"
    ],
    "core_concepts": [
      "Module boundary enforcement: Using technical mechanisms (access modifiers, build rules) to prevent unauthorized cross-module dependencies",
      "Data ownership: Each module exclusively owns its data store or schema, with no shared tables between modules",
      "Public API surface: Modules expose only intentional interfaces, hiding implementation details behind encapsulation",
      "Internal event bus: Modules communicate asynchronously through domain events rather than direct method calls for loose coupling",
      "Decomposability: The architecture is designed so that any module can be extracted into a separate service if needed in the future"
    ],
    "core_concepts_zh": [
      "模块边界强制：使用技术机制（访问修饰符、构建规则）防止未授权的跨模块依赖",
      "数据所有权：每个模块独占拥有自己的数据存储或schema，模块间不共享表",
      "公共API面：模块仅暴露有意设计的接口，通过封装隐藏实现细节",
      "内部事件总线：模块通过领域事件异步通信，而非直接方法调用，以实现松耦合",
      "可拆分性：架构设计保证任何模块在未来需要时都能被提取为独立服务"
    ],
    "timeline": [
      [
        "2003",
        "Eric Evans publishes Domain-Driven Design introducing bounded contexts that underpin modular monolith thinking"
      ],
      [
        "2015",
        "Simon Brown advocates for the modular monolith as a pragmatic alternative to premature microservices adoption"
      ],
      [
        "2018",
        "Shopify publicly shares its migration from a Rails monolith to a modular monolith architecture"
      ],
      [
        "2020",
        "Richards and Ford formalize modular monolith patterns in Fundamentals of Software Architecture"
      ],
      [
        "2023",
        "Growing industry backlash against microservices complexity drives renewed interest in modular monolith approaches"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Eric Evans出版《领域驱动设计》，引入支撑模块化单体思想的限界上下文概念"
      ],
      [
        "2015",
        "Simon Brown倡导模块化单体作为过早采用微服务的务实替代方案"
      ],
      [
        "2018",
        "Shopify公开分享从Rails单体迁移到模块化单体架构的经验"
      ],
      [
        "2020",
        "Richards和Ford在《软件架构基础》中正式定义模块化单体模式"
      ],
      [
        "2023",
        "业界对微服务复杂性的日益反思推动了对模块化单体方法的重新关注"
      ]
    ],
    "dos": [
      "Do enforce module boundaries with automated tooling because human discipline alone is insufficient to prevent coupling over time",
      "Do give each module its own database schema because shared tables are the fastest path to hidden coupling",
      "Do design module APIs as if they were remote service contracts because this makes future extraction straightforward",
      "Do use domain events for cross-module communication because it preserves loose coupling and module autonomy"
    ],
    "dos_zh": [
      "使用自动化工具强制模块边界，因为仅靠人的自律不足以长期防止耦合",
      "给每个模块独立的数据库schema，因为共享表是隐性耦合的最快路径",
      "将模块API设计得如同远程服务契约，因为这使未来的提取变得简单",
      "使用领域事件进行跨模块通信，因为这能保持松耦合和模块自治"
    ],
    "donts": [
      "Don't allow modules to share domain models because it creates semantic coupling that is extremely hard to untangle",
      "Don't skip module boundary tests because without automated verification boundaries erode within months",
      "Don't create a module for every class or small concern because over-modularization adds unnecessary complexity",
      "Don't use direct database joins across module boundaries because it tightly couples modules at the data layer"
    ],
    "donts_zh": [
      "不要让模块共享领域模型，因为这会造成极难解开的语义耦合",
      "不要跳过模块边界测试，因为没有自动化验证，边界在数月内就会退化",
      "不要为每个类或小关注点创建模块，因为过度模块化会增加不必要的复杂度",
      "不要跨模块边界进行直接数据库关联查询，因为这会在数据层紧密耦合模块"
    ],
    "case_study_company": "Shopify",
    "case_study": "Shopify migrated its massive Ruby on Rails monolith serving millions of merchants into a modular monolith by defining clear module boundaries around domain concepts like orders, inventory, and payments. They used the Packwerk tool to enforce dependency rules at the package level, preventing unauthorized cross-module references. This approach allowed Shopify to improve developer velocity while avoiding the operational overhead of hundreds of microservices. Build times and test isolation improved significantly, and the architecture remains a single deployable unit.",
    "case_study_zh": "Shopify将其服务数百万商家的大型Ruby on Rails单体迁移为模块化单体，围绕订单、库存和支付等领域概念定义清晰的模块边界。他们使用Packwerk工具在包级别强制依赖规则，防止未授权的跨模块引用。这种方法使Shopify提高了开发者效率，同时避免了数百个微服务的运维开销。构建时间和测试隔离性显著改善，架构仍然是单一可部署单元。",
    "when_not_to_use": [
      "When different modules have fundamentally different scaling requirements that demand independent horizontal scaling",
      "When teams are geographically distributed and need fully autonomous deployment pipelines for each component",
      "When the system requires polyglot persistence or different technology stacks for different modules"
    ],
    "when_not_to_use_zh": [
      "当不同模块具有根本不同的扩展需求，需要独立的水平扩展时",
      "当团队地理分散，需要每个组件拥有完全自主的部署管道时",
      "当系统需要多语言持久化或不同模块使用不同的技术栈时"
    ],
    "adopters": [
      "Shopify",
      "Basecamp",
      "Gusto",
      "Maersk",
      "GitHub"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Mark Richards and Neal Ford (2020). \"Fundamentals of Software Architecture\". O'Reilly Media.",
    "secondary_sources": [
      "Simon Brown (2015). \"Modular Monoliths\". Conference talk, various venues.",
      "Sam Newman (2019). \"Monolith to Microservices: Evolutionary Patterns to Transform Your Monolith\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "layered-architecture",
        "type": "complement"
      },
      {
        "slug": "microservices-decomposition",
        "type": "related"
      },
      {
        "slug": "c4-model",
        "type": "related"
      }
    ]
  },
  {
    "id": 169,
    "name": "Space-Based Architecture",
    "name_zh": "基于空间的架构",
    "slug": "space-based-architecture",
    "category": "architecture",
    "desc": "Distributed architecture using in-memory data grids and processing units to achieve high scalability by eliminating the central database bottleneck",
    "desc_zh": "使用内存数据网格和处理单元消除中央数据库瓶颈，实现高可扩展性的分布式架构",
    "steps": [
      "Identify processing units: self-contained modules that include business logic, an in-memory data grid, and optional async persistence",
      "Deploy a virtualized middleware layer that manages data replication, request routing, and processing unit orchestration",
      "Configure the in-memory data grid for data replication across processing units so each unit has a local copy of shared data",
      "Implement asynchronous data pumps to persist in-memory state to the database for durability without blocking request processing",
      "Add elasticity by configuring the middleware to dynamically spin up or shut down processing units based on load thresholds"
    ],
    "steps_zh": [
      "识别处理单元：包含业务逻辑、内存数据网格和可选异步持久化的自包含模块",
      "部署虚拟化中间件层，管理数据复制、请求路由和处理单元编排",
      "配置内存数据网格在处理单元间进行数据复制，使每个单元拥有共享数据的本地副本",
      "实现异步数据泵，将内存状态持久化到数据库以确保持久性，而不阻塞请求处理",
      "通过配置中间件根据负载阈值动态启动或关闭处理单元来增加弹性"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Processing Unit",
      "Tuple Space",
      "Messaging Grid",
      "Data Grid"
    ],
    "viz_labels_zh": [
      "处理单元",
      "元组空间",
      "消息网格",
      "数据网格"
    ],
    "related": [
      "microservices-decomposition",
      "eda",
      "cqrs-pattern"
    ],
    "tags": [
      "scalability",
      "in-memory",
      "data-grid",
      "elasticity",
      "distributed-processing"
    ],
    "origin_author": "Mark Richards, 2006; formalized in Richards & Ford, 2020",
    "origin_source": "Fundamentals of Software Architecture (O'Reilly, Richards & Ford, 2020); originally from Software Architecture Patterns (O'Reilly, 2015)",
    "origin_source_zh": "《软件架构基础》（O'Reilly，Richards和Ford，2020年）；最初来自《软件架构模式》（O'Reilly，2015年）",
    "complexity": "advanced",
    "when_to_use": [
      "When the application must handle extreme spikes in concurrent users, such as concert ticket sales or flash sales",
      "When the central database is the primary bottleneck and traditional scaling strategies have been exhausted",
      "When ultra-low latency is required and data can be served from in-memory grids rather than disk-based stores",
      "When the system needs elastic scaling that responds to load changes in seconds rather than minutes"
    ],
    "when_to_use_zh": [
      "当应用必须处理并发用户的极端峰值，如演唱会售票或闪购时",
      "当中央数据库是主要瓶颈且传统扩展策略已用尽时",
      "当需要超低延迟且数据可从内存网格而非磁盘存储中提供时",
      "当系统需要在数秒而非数分钟内响应负载变化的弹性扩展时"
    ],
    "core_concepts": [
      "Processing unit: A self-contained deployment unit with business logic, in-memory data, and optional async persistence bundled together",
      "Virtualized middleware: The infrastructure layer that handles routing, session management, data replication, and unit orchestration",
      "In-memory data grid: Distributed caching technology that replicates data across processing units for fast local access",
      "Data pump: Asynchronous component that writes in-memory data changes to the persistent store without blocking user requests",
      "Elastic scaling: The ability to add or remove processing units dynamically based on real-time load metrics"
    ],
    "core_concepts_zh": [
      "处理单元：将业务逻辑、内存数据和可选异步持久化捆绑在一起的自包含部署单元",
      "虚拟化中间件：处理路由、会话管理、数据复制和单元编排的基础设施层",
      "内存数据网格：在处理单元间复制数据以实现快速本地访问的分布式缓存技术",
      "数据泵：将内存数据变更异步写入持久存储而不阻塞用户请求的组件",
      "弹性扩展：基于实时负载指标动态添加或移除处理单元的能力"
    ],
    "timeline": [
      [
        "2004",
        "JavaSpaces and tuple space concepts inspire distributed in-memory processing patterns"
      ],
      [
        "2006",
        "Mark Richards introduces space-based architecture as a pattern for extreme scalability"
      ],
      [
        "2012",
        "GigaSpaces, Hazelcast, and Apache Ignite mature as enterprise in-memory data grid platforms"
      ],
      [
        "2015",
        "Richards publishes Software Architecture Patterns with a dedicated chapter on space-based architecture"
      ],
      [
        "2020",
        "Richards and Ford provide comprehensive analysis of space-based architecture in Fundamentals of Software Architecture"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "JavaSpaces和元组空间概念启发了分布式内存处理模式"
      ],
      [
        "2006",
        "Mark Richards提出基于空间的架构作为极端可扩展性的模式"
      ],
      [
        "2012",
        "GigaSpaces、Hazelcast和Apache Ignite作为企业内存数据网格平台趋于成熟"
      ],
      [
        "2015",
        "Richards出版《软件架构模式》，设专章讨论基于空间的架构"
      ],
      [
        "2020",
        "Richards和Ford在《软件架构基础》中对基于空间的架构进行全面分析"
      ]
    ],
    "dos": [
      "Do design processing units to be stateless across requests because it enables seamless horizontal scaling and failover",
      "Do implement robust data replication monitoring because silent replication failures lead to data inconsistency",
      "Do use asynchronous data pumps for persistence because synchronous writes to the database reintroduce the bottleneck",
      "Do test with realistic load spikes because space-based architecture only proves its value under extreme concurrency"
    ],
    "dos_zh": [
      "将处理单元设计为跨请求无状态，因为这能实现无缝的水平扩展和故障转移",
      "实施健壮的数据复制监控，因为静默的复制失败会导致数据不一致",
      "使用异步数据泵进行持久化，因为同步写入数据库会重新引入瓶颈",
      "使用真实的负载峰值进行测试，因为基于空间的架构只有在极端并发下才能证明其价值"
    ],
    "donts": [
      "Don't use space-based architecture for systems with low concurrency because the complexity is not justified for moderate loads",
      "Don't assume eventual consistency is acceptable for all operations because some transactions require strong consistency guarantees",
      "Don't neglect data collision handling because concurrent updates to the same data across processing units cause conflicts",
      "Don't skip capacity planning for the in-memory grid because memory exhaustion causes cascading failures"
    ],
    "donts_zh": [
      "不要在低并发系统中使用基于空间的架构，因为对于中等负载而言，这种复杂度得不偿失",
      "不要假设最终一致性对所有操作都可接受，因为某些事务需要强一致性保证",
      "不要忽视数据冲突处理，因为处理单元间对同一数据的并发更新会造成冲突",
      "不要跳过内存网格的容量规划，因为内存耗尽会导致级联故障"
    ],
    "case_study_company": "Ticketmaster",
    "case_study": "Ticketmaster adopted a space-based architecture to handle the extreme traffic spikes during major concert on-sales, where millions of users simultaneously compete for limited tickets. By deploying processing units with in-memory data grids, Ticketmaster eliminated the database as the bottleneck during peak demand. Each processing unit holds a replicated copy of available inventory, enabling sub-millisecond seat lookups. Asynchronous data pumps persist purchase confirmations to the database. This architecture allowed Ticketmaster to handle over 14 billion API calls per day during peak events while maintaining response times under 50 milliseconds.",
    "case_study_zh": "Ticketmaster采用基于空间的架构来处理大型演唱会开售时的极端流量峰值，数百万用户同时竞争有限的门票。通过部署带有内存数据网格的处理单元，Ticketmaster消除了峰值需求时数据库的瓶颈。每个处理单元持有可用库存的复制副本，实现亚毫秒级的座位查询。异步数据泵将购买确认持久化到数据库。该架构使Ticketmaster在峰值事件期间每天处理超过140亿次API调用，同时保持响应时间在50毫秒以内。",
    "when_not_to_use": [
      "Applications with low or predictable traffic that do not justify the operational complexity of distributed in-memory grids",
      "Systems requiring strong transactional consistency where eventual consistency is unacceptable",
      "Budget-constrained projects where the high memory and infrastructure costs of in-memory data grids are prohibitive"
    ],
    "when_not_to_use_zh": [
      "流量低或可预测的应用，不值得为其承担分布式内存网格的运维复杂度",
      "需要强事务一致性、最终一致性不可接受的系统",
      "预算有限的项目，内存数据网格的高内存和基础设施成本难以承受"
    ],
    "adopters": [
      "Ticketmaster",
      "GigaSpaces",
      "Bloomberg",
      "Goldman Sachs",
      "eBay"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Mark Richards (2015). \"Software Architecture Patterns\". O'Reilly Media.",
    "secondary_sources": [
      "Mark Richards and Neal Ford (2020). \"Fundamentals of Software Architecture\". O'Reilly Media.",
      "Nati Shalom (2006). \"Space-Based Architecture and the End of Tier-Based Computing\". GigaSpaces."
    ],
    "typed_relations": [
      {
        "slug": "microservices-decomposition",
        "type": "complement"
      },
      {
        "slug": "eda",
        "type": "related"
      },
      {
        "slug": "cqrs-pattern",
        "type": "related"
      }
    ]
  },
  {
    "id": 170,
    "name": "Pipe and Filter Architecture",
    "name_zh": "管道与过滤器架构",
    "slug": "pipe-and-filter",
    "category": "architecture",
    "desc": "Architecture pattern that decomposes data processing into independent, composable stages connected by data channels",
    "desc_zh": "将数据处理分解为通过数据通道连接的独立可组合阶段的架构模式",
    "steps": [
      "Identify the end-to-end data processing workflow and break it into discrete transformation stages (filters)",
      "Define the data format for pipes: establish a uniform or compatible data contract between filters",
      "Implement each filter as a self-contained component that reads from an input pipe, transforms data, and writes to an output pipe",
      "Connect filters with pipes (queues, streams, or in-memory channels) to form a processing pipeline",
      "Add monitoring, error handling, and backpressure mechanisms to ensure pipeline resilience under varying load"
    ],
    "steps_zh": [
      "识别端到端的数据处理工作流，将其分解为离散的转换阶段（过滤器）",
      "定义管道的数据格式：在过滤器之间建立统一或兼容的数据契约",
      "将每个过滤器实现为自包含的组件，从输入管道读取、转换数据并写入输出管道",
      "使用管道（队列、流或内存通道）连接过滤器，形成处理流水线",
      "添加监控、错误处理和背压机制，确保流水线在不同负载下的韧性"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Source",
      "Filter",
      "Pipe",
      "Sink"
    ],
    "viz_labels_zh": [
      "数据源",
      "过滤器",
      "管道",
      "数据汇"
    ],
    "related": [
      "eda",
      "space-based-architecture",
      "layered-architecture"
    ],
    "tags": [
      "data-processing",
      "composability",
      "streaming",
      "pipeline",
      "transformation"
    ],
    "origin_author": "Doug McIlroy, 1964; formalized by Mary Shaw and David Garlan, 1996",
    "origin_source": "Software Architecture: Perspectives on an Emerging Discipline (Shaw & Garlan, Prentice Hall, 1996)",
    "origin_source_zh": "《软件架构：一门新兴学科的观点》（Shaw和Garlan，Prentice Hall，1996年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When data must flow through a series of transformation, validation, or enrichment steps in a defined sequence",
      "When individual processing stages need to be developed, tested, and scaled independently",
      "When building ETL pipelines, data ingestion systems, or stream processing applications",
      "When processing logic benefits from reusable, composable components that can be rearranged into different pipelines"
    ],
    "when_to_use_zh": [
      "当数据必须按定义的顺序经过一系列转换、验证或富化步骤时",
      "当各处理阶段需要独立开发、测试和扩展时",
      "当构建ETL管道、数据摄取系统或流处理应用时",
      "当处理逻辑受益于可重用、可组合、可重新排列到不同管道中的组件时"
    ],
    "core_concepts": [
      "Filter: An independent processing component that receives input, applies a transformation, and produces output without knowledge of adjacent filters",
      "Pipe: A connector that transports data between filters, which can be synchronous (function calls) or asynchronous (message queues)",
      "Composability: Filters can be rearranged, added, or removed to create new processing workflows without modifying existing components",
      "Backpressure: A flow control mechanism that prevents fast producers from overwhelming slow consumers in the pipeline",
      "Fan-out and fan-in: Patterns for splitting a pipeline into parallel branches and merging results back together"
    ],
    "core_concepts_zh": [
      "过滤器：接收输入、应用转换并产生输出的独立处理组件，不感知相邻过滤器",
      "管道：在过滤器之间传输数据的连接器，可以是同步的（函数调用）或异步的（消息队列）",
      "可组合性：过滤器可以重新排列、添加或移除以创建新的处理工作流，而无需修改现有组件",
      "背压：防止快速生产者压垮管道中慢速消费者的流控机制",
      "扇出与扇入：将管道分裂为并行分支并将结果合并回来的模式"
    ],
    "timeline": [
      [
        "1964",
        "Doug McIlroy proposes the concept of connecting programs like garden hoses at Bell Labs"
      ],
      [
        "1973",
        "Unix pipes implement the pipe-and-filter pattern at the operating system level"
      ],
      [
        "1996",
        "Shaw and Garlan formalize pipe-and-filter as a fundamental architectural style"
      ],
      [
        "2011",
        "Apache Kafka popularizes distributed streaming pipelines at scale"
      ],
      [
        "2020",
        "Richards and Ford analyze pipe-and-filter trade-offs for modern distributed architectures in Fundamentals of Software Architecture"
      ]
    ],
    "timeline_zh": [
      [
        "1964",
        "Doug McIlroy在贝尔实验室提出像连接花园水管一样连接程序的概念"
      ],
      [
        "1973",
        "Unix管道在操作系统层面实现管道与过滤器模式"
      ],
      [
        "1996",
        "Shaw和Garlan将管道与过滤器正式定义为基本架构风格"
      ],
      [
        "2011",
        "Apache Kafka将分布式流式管道大规模普及"
      ],
      [
        "2020",
        "Richards和Ford在《软件架构基础》中分析现代分布式架构的管道与过滤器权衡"
      ]
    ],
    "dos": [
      "Do keep filters stateless because stateless filters can be parallelized, restarted, and scaled independently",
      "Do define a uniform data format between filters because incompatible formats require costly adapter layers",
      "Do implement backpressure handling because without it, pipeline failures cascade from the slowest filter outward",
      "Do make filters idempotent because retries after failure should not produce duplicate or corrupted results"
    ],
    "dos_zh": [
      "保持过滤器无状态，因为无状态过滤器可以被并行化、重启和独立扩展",
      "定义过滤器之间的统一数据格式，因为不兼容的格式需要昂贵的适配器层",
      "实现背压处理，因为没有背压时管道故障会从最慢的过滤器向外级联",
      "使过滤器具有幂等性，因为失败后的重试不应产生重复或损坏的结果"
    ],
    "donts": [
      "Don't share state between filters through side channels because it creates hidden coupling and makes the pipeline non-deterministic",
      "Don't create overly fine-grained filters because the overhead of serialization and transport between too many stages degrades throughput",
      "Don't ignore error handling in intermediate filters because a silent failure mid-pipeline corrupts all downstream data",
      "Don't build circular pipelines because feedback loops in filter chains create debugging nightmares and potential infinite loops"
    ],
    "donts_zh": [
      "不要通过旁路在过滤器间共享状态，因为这会造成隐性耦合并使管道行为变得不确定",
      "不要创建过于细粒度的过滤器，因为太多阶段间的序列化和传输开销会降低吞吐量",
      "不要忽略中间过滤器的错误处理，因为管道中部的静默故障会损坏所有下游数据",
      "不要构建循环管道，因为过滤器链中的反馈循环会造成调试噩梦和潜在的无限循环"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's data pipeline platform processes over a trillion events per day using a pipe-and-filter architecture built on Apache Kafka and Apache Flink. Each filter stage handles a specific responsibility: ingestion, deduplication, enrichment with user metadata, aggregation for analytics, and routing to downstream data stores. The composability of the pipeline allows Netflix to add new filters (such as real-time anomaly detection) without disrupting existing processing stages. This architecture processes petabytes of streaming data daily with sub-second latency for real-time analytics.",
    "case_study_zh": "Netflix的数据管道平台使用基于Apache Kafka和Apache Flink构建的管道与过滤器架构，每天处理超过一万亿个事件。每个过滤器阶段处理特定职责：摄取、去重、用户元数据富化、分析聚合以及路由到下游数据存储。管道的可组合性允许Netflix在不干扰现有处理阶段的情况下添加新过滤器（如实时异常检测）。该架构每天处理PB级的流式数据，实时分析延迟不到一秒。",
    "when_not_to_use": [
      "Interactive applications requiring synchronous request-response patterns where pipeline latency is unacceptable",
      "Systems where processing steps are tightly interdependent and cannot operate on data independently",
      "Small applications where the overhead of defining pipes and filters exceeds the benefit of modularity"
    ],
    "when_not_to_use_zh": [
      "需要同步请求-响应模式、管道延迟不可接受的交互式应用",
      "处理步骤紧密相互依赖、无法独立操作数据的系统",
      "定义管道和过滤器的开销超过模块化收益的小型应用"
    ],
    "adopters": [
      "Netflix",
      "LinkedIn",
      "Uber",
      "Confluent",
      "Spotify"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "portability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Mary Shaw and David Garlan (1996). \"Software Architecture: Perspectives on an Emerging Discipline\". Prentice Hall.",
    "secondary_sources": [
      "M. D. McIlroy, E. N. Pinson, and B. A. Tague (1978). \"UNIX Time-Sharing System: Foreword\". Bell System Technical Journal, 57(6).",
      "Frank Buschmann et al. (1996). \"Pattern-Oriented Software Architecture, Volume 1: A System of Patterns\". Wiley."
    ],
    "typed_relations": [
      {
        "slug": "eda",
        "type": "complement"
      },
      {
        "slug": "space-based-architecture",
        "type": "related"
      },
      {
        "slug": "layered-architecture",
        "type": "related"
      }
    ]
  },
  {
    "id": 171,
    "name": "Architecture Fitness Functions",
    "name_zh": "架构适应度函数",
    "slug": "architecture-fitness-functions",
    "category": "architecture",
    "desc": "Automated, objective compliance checks that continuously validate whether an architecture meets its defined characteristics",
    "desc_zh": "自动化的客观合规检查，持续验证架构是否满足其定义的特征",
    "steps": [
      "Identify the critical architecture characteristics (e.g., modularity, performance, security) that must be preserved as the system evolves",
      "For each characteristic, define a measurable fitness function with a clear threshold or acceptance criterion",
      "Implement the fitness functions as automated tests: unit-level (dependency checks), integration-level (latency budgets), or holistic (deployment frequency)",
      "Integrate fitness functions into the CI/CD pipeline so they run automatically on every build or deployment",
      "Review and update fitness functions regularly as architecture goals evolve, removing obsolete checks and adding new ones"
    ],
    "steps_zh": [
      "识别系统演进中必须保持的关键架构特征（如模块性、性能、安全性）",
      "为每个特征定义可度量的适应度函数，带有明确的阈值或验收标准",
      "将适应度函数实现为自动化测试：单元级（依赖检查）、集成级（延迟预算）或全局级（部署频率）",
      "将适应度函数集成到CI/CD管道中，使其在每次构建或部署时自动运行",
      "随着架构目标的演进定期审查和更新适应度函数，移除过时的检查并添加新的检查"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Coupling",
      "Cohesion",
      "Performance",
      "Security",
      "Drift"
    ],
    "viz_labels_zh": [
      "耦合度",
      "内聚度",
      "性能",
      "安全",
      "架构漂移"
    ],
    "related": [
      "adr",
      "atam",
      "continuous-architecture"
    ],
    "tags": [
      "governance",
      "automation",
      "compliance",
      "evolutionary-architecture",
      "metrics"
    ],
    "origin_author": "Neal Ford, Rebecca Parsons, Patrick Kua, 2017",
    "origin_source": "Building Evolutionary Architectures (O'Reilly, Ford, Parsons & Kua, 2017)",
    "origin_source_zh": "《演进式架构》（O'Reilly，Ford、Parsons和Kua，2017年）",
    "complexity": "intermediate",
    "when_to_use": [
      "When architectural principles need to be enforced automatically because manual reviews are too slow or inconsistent",
      "When a system is evolving rapidly and there is a risk of architectural drift as new features are added",
      "When multiple teams contribute to a shared codebase and consistent architectural standards must be maintained",
      "When migrating or modernizing a system and you need guardrails to ensure the target architecture is being achieved"
    ],
    "when_to_use_zh": [
      "当架构原则需要自动执行，因为人工评审太慢或不一致时",
      "当系统快速演进，随着新功能不断加入而存在架构漂移风险时",
      "当多个团队贡献同一代码库，需要维持一致的架构标准时",
      "当迁移或现代化系统，需要护栏来确保目标架构得以实现时"
    ],
    "core_concepts": [
      "Fitness function: An objective, automated assessment that measures how well the architecture meets a specific characteristic",
      "Atomic fitness function: Tests a single architecture dimension in isolation, such as cyclic dependency detection or response time",
      "Holistic fitness function: Evaluates a combination of architecture characteristics together, such as deployment pipeline throughput",
      "Triggered vs. continuous: Some fitness functions run on each commit (triggered) while others monitor production metrics continuously",
      "Evolutionary architecture: The overarching principle that architecture should support guided, incremental change rather than big upfront design"
    ],
    "core_concepts_zh": [
      "适应度函数：度量架构满足特定特征程度的客观自动化评估",
      "原子适应度函数：孤立测试单一架构维度，如循环依赖检测或响应时间",
      "全局适应度函数：综合评估多个架构特征的组合，如部署管道吞吐量",
      "触发式与持续式：某些适应度函数在每次提交时运行（触发式），另一些持续监控生产指标",
      "演进式架构：架构应支持有引导的增量变更而非大规模预先设计的总体原则"
    ],
    "timeline": [
      [
        "2015",
        "Neal Ford and Rebecca Parsons begin presenting evolutionary architecture concepts at conferences"
      ],
      [
        "2017",
        "Ford, Parsons, and Kua publish Building Evolutionary Architectures introducing fitness functions formally"
      ],
      [
        "2018",
        "ArchUnit (Java) gains popularity as a tool for implementing architectural fitness functions as unit tests"
      ],
      [
        "2020",
        "Richards and Ford incorporate fitness functions into Fundamentals of Software Architecture as a governance mechanism"
      ],
      [
        "2023",
        "Second edition of Building Evolutionary Architectures expands coverage of fitness functions for cloud-native and AI systems"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Neal Ford和Rebecca Parsons开始在会议上演讲演进式架构概念"
      ],
      [
        "2017",
        "Ford、Parsons和Kua出版《演进式架构》，正式引入适应度函数"
      ],
      [
        "2018",
        "ArchUnit（Java）作为将架构适应度函数实现为单元测试的工具流行起来"
      ],
      [
        "2020",
        "Richards和Ford将适应度函数作为治理机制纳入《软件架构基础》"
      ],
      [
        "2023",
        "《演进式架构》第二版扩展了适应度函数在云原生和AI系统中的覆盖"
      ]
    ],
    "dos": [
      "Do start with the most critical architecture characteristics because trying to measure everything at once leads to alert fatigue",
      "Do make fitness functions fast and deterministic because flaky or slow checks undermine developer trust in the pipeline",
      "Do treat fitness function thresholds as living documents because acceptable ranges change as the system matures",
      "Do visualize fitness function trends over time because trends reveal architectural drift before thresholds are breached"
    ],
    "dos_zh": [
      "从最关键的架构特征开始，因为试图同时度量所有东西会导致告警疲劳",
      "使适应度函数快速且确定性，因为不稳定或缓慢的检查会破坏开发者对管道的信任",
      "将适应度函数阈值视为活文档，因为可接受范围会随系统成熟而变化",
      "可视化适应度函数随时间的趋势，因为趋势能在阈值被突破之前揭示架构漂移"
    ],
    "donts": [
      "Don't write fitness functions that are too strict initially because they block development and get disabled rather than refined",
      "Don't rely solely on code-level checks because many architecture characteristics (latency, availability) require runtime measurement",
      "Don't forget to test the fitness functions themselves because a broken fitness function provides false confidence",
      "Don't use fitness functions as a substitute for architecture discussion because they enforce decisions but do not make them"
    ],
    "donts_zh": [
      "初始不要将适应度函数设置得过于严格，因为它们会阻塞开发并被禁用而非优化",
      "不要仅依赖代码级检查，因为许多架构特征（延迟、可用性）需要运行时度量",
      "不要忘记测试适应度函数本身，因为损坏的适应度函数会提供虚假的信心",
      "不要用适应度函数替代架构讨论，因为它们执行决策但不做决策"
    ],
    "case_study_company": "ThoughtWorks",
    "case_study": "ThoughtWorks, the company where fitness functions originated, uses them extensively in client engagements to prevent architectural erosion during long-running modernization projects. On one large financial services client, they implemented fitness functions using ArchUnit to enforce that no module in the new microservices-ready codebase could depend on legacy shared libraries. They also added latency budget fitness functions that failed the build if any API endpoint exceeded 200ms at the p95 level. Over 18 months, these automated checks caught 47 architectural violations that would have otherwise slipped into production unnoticed.",
    "case_study_zh": "ThoughtWorks作为适应度函数的发源公司，在客户项目中广泛使用适应度函数防止长期现代化项目中的架构侵蚀。在一个大型金融服务客户项目中，他们使用ArchUnit实现适应度函数，确保新的微服务就绪代码库中没有模块依赖遗留共享库。他们还添加了延迟预算适应度函数，当任何API端点在p95级别超过200毫秒时构建失败。在18个月内，这些自动化检查捕获了47个架构违规，否则这些违规将悄无声息地进入生产环境。",
    "when_not_to_use": [
      "Very early-stage prototypes where the architecture is intentionally fluid and constraints would slow exploration",
      "Small teams with strong shared understanding where the overhead of automated checks exceeds their governance value",
      "Throwaway systems with a planned short lifespan where architectural longevity is not a concern"
    ],
    "when_not_to_use_zh": [
      "架构有意保持流动性、约束会减慢探索速度的早期原型阶段",
      "具有强烈共识的小型团队，自动化检查的开销超过其治理价值时",
      "计划短生命周期的一次性系统，架构持久性不是关注点时"
    ],
    "adopters": [
      "ThoughtWorks",
      "Netflix",
      "Spotify",
      "Zalando",
      "ING Bank"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Neal Ford, Rebecca Parsons, and Patrick Kua (2017). \"Building Evolutionary Architectures: Support Constant Change\". O'Reilly Media.",
    "secondary_sources": [
      "Neal Ford, Rebecca Parsons, and Patrick Kua (2023). \"Building Evolutionary Architectures, 2nd Edition\". O'Reilly Media.",
      "ThoughtWorks (2017). \"Fitness Functions\". thoughtworks.com/radar/techniques."
    ],
    "typed_relations": [
      {
        "slug": "adr",
        "type": "complement"
      },
      {
        "slug": "atam",
        "type": "related"
      },
      {
        "slug": "continuous-architecture",
        "type": "related"
      }
    ]
  },
  {
    "id": 172,
    "name": "Decision Matrix (Weighted Scoring)",
    "name_zh": "决策矩阵（加权评分法）",
    "slug": "decision-matrix",
    "category": "architecture",
    "desc": "Systematic trade-off evaluation method that scores architecture alternatives against weighted criteria to make objective, transparent decisions",
    "desc_zh": "针对加权标准对架构备选方案进行评分的系统化权衡评估方法，以做出客观透明的决策",
    "steps": [
      "List all candidate architecture alternatives and define the evaluation criteria (e.g., scalability, cost, team expertise, time-to-market)",
      "Assign weights to each criterion based on business priorities, ensuring weights sum to 100% or a consistent scale",
      "Score each alternative against every criterion using a consistent rating scale (e.g., 1-5 or 1-10)",
      "Calculate weighted scores by multiplying each rating by its criterion weight and summing across all criteria per alternative",
      "Analyze the results: review the rankings, perform sensitivity analysis on key weights, and document the decision rationale"
    ],
    "steps_zh": [
      "列出所有候选架构方案并定义评估标准（如可扩展性、成本、团队专业知识、上市时间）",
      "根据业务优先级为每个标准分配权重，确保权重总和为100%或一致的量表",
      "使用一致的评分量表（如1-5或1-10）为每个方案在每个标准上评分",
      "计算加权分数：将每个评分乘以其标准权重，并按方案汇总所有标准的分数",
      "分析结果：审查排名，对关键权重进行敏感性分析，并记录决策理由"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Option",
      "Criteria",
      "Weight",
      "Score"
    ],
    "viz_labels_zh": [
      "方案选项",
      "评估标准",
      "权重",
      "得分"
    ],
    "related": [
      "atam",
      "trade-off-sliders",
      "adr"
    ],
    "tags": [
      "decision-making",
      "trade-offs",
      "evaluation",
      "scoring",
      "prioritization"
    ],
    "origin_author": "Stuart Pugh, 1991; adapted for software architecture by Bass, Clements, and Kazman",
    "origin_source": "Software Architecture in Practice, 4th Edition (Addison-Wesley, Bass, Clements & Kazman, 2021)",
    "origin_source_zh": "《软件架构实践》第四版（Addison-Wesley，Bass、Clements和Kazman，2021年）",
    "complexity": "beginner",
    "when_to_use": [
      "When multiple architecture options exist and stakeholders need a transparent, objective comparison framework",
      "When subjective debates about technology choices are stalling decision-making progress",
      "When the decision involves many criteria with different levels of importance to different stakeholders",
      "When you need an auditable record of how and why a particular architecture was chosen"
    ],
    "when_to_use_zh": [
      "当存在多个架构选项，利益相关者需要透明客观的比较框架时",
      "当关于技术选择的主观争论阻碍了决策进展时",
      "当决策涉及对不同利益相关者具有不同重要性的多个标准时",
      "当需要可审计的记录来说明特定架构是如何以及为何被选中时"
    ],
    "core_concepts": [
      "Criteria definition: Identifying the measurable dimensions against which architecture alternatives will be evaluated",
      "Weight assignment: Quantifying the relative importance of each criterion based on stakeholder priorities and business context",
      "Scoring consistency: Using a uniform rating scale and clear rubrics to ensure scores are comparable across evaluators",
      "Sensitivity analysis: Testing how changes in weights or scores affect the final ranking to identify fragile conclusions",
      "Decision transparency: Making the entire evaluation process visible and reproducible so stakeholders trust the outcome"
    ],
    "core_concepts_zh": [
      "标准定义：识别用于评估架构备选方案的可度量维度",
      "权重分配：根据利益相关者优先级和业务背景量化每个标准的相对重要性",
      "评分一致性：使用统一的评分量表和清晰的评判标准，确保评分在评估者间可比较",
      "敏感性分析：测试权重或分数的变化如何影响最终排名，以识别脆弱的结论",
      "决策透明度：使整个评估过程可见且可重现，以便利益相关者信任结果"
    ],
    "timeline": [
      [
        "1991",
        "Stuart Pugh publishes Total Design introducing the Pugh matrix for engineering decision-making"
      ],
      [
        "2003",
        "Bass, Clements, and Kazman integrate weighted scoring into architecture evaluation in Software Architecture in Practice"
      ],
      [
        "2012",
        "Decision matrices become standard practice in enterprise architecture governance frameworks like TOGAF"
      ],
      [
        "2020",
        "Richards and Ford discuss trade-off analysis techniques including decision matrices in Fundamentals of Software Architecture"
      ],
      [
        "2021",
        "Bass, Clements, and Kazman refine the decision matrix approach in the 4th edition of Software Architecture in Practice"
      ]
    ],
    "timeline_zh": [
      [
        "1991",
        "Stuart Pugh出版《全面设计》，引入Pugh矩阵用于工程决策"
      ],
      [
        "2003",
        "Bass、Clements和Kazman在《软件架构实践》中将加权评分集成到架构评估中"
      ],
      [
        "2012",
        "决策矩阵成为TOGAF等企业架构治理框架中的标准实践"
      ],
      [
        "2020",
        "Richards和Ford在《软件架构基础》中讨论包括决策矩阵在内的权衡分析技术"
      ],
      [
        "2021",
        "Bass、Clements和Kazman在《软件架构实践》第四版中完善决策矩阵方法"
      ]
    ],
    "dos": [
      "Do involve diverse stakeholders in weight assignment because individual bias skews the outcome toward one perspective",
      "Do perform sensitivity analysis on the top-weighted criteria because small weight changes can flip the final ranking",
      "Do document the rationale behind both weights and scores because the reasoning is as valuable as the numbers",
      "Do revisit the decision matrix when major requirements change because stale evaluations lead to suboptimal choices"
    ],
    "dos_zh": [
      "让多元化的利益相关者参与权重分配，因为个人偏见会使结果倾向于某一视角",
      "对权重最高的标准进行敏感性分析，因为权重的微小变化可能翻转最终排名",
      "记录权重和评分背后的理由，因为推理过程与数字同样有价值",
      "当主要需求变更时重新审视决策矩阵，因为过时的评估会导致次优选择"
    ],
    "donts": [
      "Don't let one person assign all weights alone because it introduces unchecked bias into the evaluation",
      "Don't use the matrix to justify a predetermined conclusion because stakeholders will lose trust in the process",
      "Don't over-refine scores with false precision because the difference between a 7.2 and a 7.3 is meaningless noise",
      "Don't ignore qualitative factors that resist scoring because some critical considerations (team morale, organizational politics) are hard to quantify"
    ],
    "donts_zh": [
      "不要让一个人单独分配所有权重，因为这会在评估中引入未经检验的偏见",
      "不要用矩阵来为预设结论辩护，因为利益相关者会对流程失去信任",
      "不要以虚假精度过度细化分数，因为7.2和7.3之间的差异是无意义的噪声",
      "不要忽视难以评分的定性因素，因为某些关键考量（团队士气、组织政治）很难量化"
    ],
    "case_study_company": "Capital One",
    "case_study": "Capital One used a weighted decision matrix when evaluating its move from on-premise data centers to cloud infrastructure, comparing AWS, Azure, and Google Cloud across 12 criteria including cost, security compliance, developer tooling, managed services breadth, and talent availability. Each criterion was weighted by a cross-functional committee of architects, security officers, and business leaders. The matrix revealed that while Azure scored highest on enterprise integration, AWS dominated on managed services and talent pool. Sensitivity analysis showed the decision was robust across reasonable weight variations. The transparent process helped Capital One justify its AWS-first strategy to regulators and internal stakeholders.",
    "case_study_zh": "Capital One在评估从本地数据中心迁移到云基础设施时使用加权决策矩阵，在12个标准（包括成本、安全合规、开发者工具、托管服务广度和人才可用性）上比较AWS、Azure和Google Cloud。每个标准由架构师、安全官员和业务领导者组成的跨职能委员会分配权重。矩阵显示，虽然Azure在企业集成方面得分最高，但AWS在托管服务和人才储备方面占主导地位。敏感性分析表明该决策在合理的权重变化范围内是稳健的。透明的流程帮助Capital One向监管机构和内部利益相关者证明其「AWS优先」策略的合理性。",
    "when_not_to_use": [
      "When there are fewer than two viable alternatives, making a comparison matrix unnecessary overhead",
      "When the decision is time-critical and the overhead of formal evaluation outweighs the risk of a quick judgment",
      "When all options are roughly equivalent and the decision should be based on team preference or experimentation rather than analysis"
    ],
    "when_not_to_use_zh": [
      "当可行方案少于两个时，比较矩阵成为不必要的开销",
      "当决策时间紧迫，正式评估的开销超过快速判断的风险时",
      "当所有选项大致等价，决策应基于团队偏好或实验而非分析时"
    ],
    "adopters": [
      "Capital One",
      "ThoughtWorks",
      "McKinsey",
      "Accenture",
      "Deloitte"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Stuart Pugh (1991). \"Total Design: Integrated Methods for Successful Product Engineering\". Addison-Wesley.",
    "secondary_sources": [
      "Len Bass, Paul Clements, and Rick Kazman (2021). \"Software Architecture in Practice, 4th Edition\". Addison-Wesley.",
      "Ralph Keeney and Howard Raiffa (1993). \"Decisions with Multiple Objectives: Preferences and Value Tradeoffs\". Cambridge University Press."
    ],
    "typed_relations": [
      {
        "slug": "atam",
        "type": "complement"
      },
      {
        "slug": "trade-off-sliders",
        "type": "related"
      },
      {
        "slug": "adr",
        "type": "related"
      }
    ]
  },
  {
    "id": 261,
    "name": "Cell-Based Architecture",
    "name_zh": "单元化架构",
    "slug": "cell-based-architecture",
    "category": "architecture",
    "desc": "An architectural pattern where a system is divided into independent, self-contained cells that own their data, compute, and network resources, enabling granular scaling, fault isolation, and independent deployability",
    "desc_zh": "将系统划分为独立、自包含的单元（cell）的架构模式，每个单元拥有自己的数据、计算和网络资源，实现细粒度扩展、故障隔离和独立部署能力",
    "steps": [
      "Cell Boundary Definition: Identify natural business or tenant partitions (geographic region, customer segment, product line) that can be served independently and will not share state with other cells",
      "Cell Blueprint Design: Define the standard template for a cell — its services, databases, message queues, API gateway, and internal networking — so every cell is a consistent deployable unit",
      "Cell Router: Build a stateless routing layer (DNS-based, API gateway, or service mesh) that maps incoming requests to the correct cell based on cell affinity (tenant ID, region, user hash)",
      "Cell Lifecycle Automation: Implement infrastructure-as-code templates (Terraform, Helm) to provision, upgrade, scale, and decommission cells independently without coordination across the cell fleet",
      "Observability per Cell: Deploy isolated monitoring stacks per cell (or cell-aware dashboards in a shared platform) so failures, latency spikes, and capacity issues are diagnosed at cell granularity"
    ],
    "steps_zh": [
      "单元边界定义：识别可独立服务且不与其他单元共享状态的自然业务或租户分区（地理区域、客户细分、产品线）",
      "单元蓝图设计：定义单元的标准模板——其服务、数据库、消息队列、API网关和内部网络——使每个单元成为一致的可部署单元",
      "单元路由器：构建无状态路由层（基于DNS、API网关或服务网格），根据单元亲和性（租户ID、区域、用户哈希）将传入请求映射到正确的单元",
      "单元生命周期自动化：实施基础设施即代码模板（Terraform、Helm）以独立地供应、升级、扩展和退役单元，无需跨单元舰队协调",
      "每单元可观测性：为每个单元部署隔离的监控栈（或共享平台中的单元感知仪表板），以便在单元粒度诊断故障、延迟峰值和容量问题"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Cell",
      "Gateway",
      "Data Plane",
      "Control Plane"
    ],
    "viz_labels_zh": [
      "单元格",
      "网关",
      "数据平面",
      "控制平面"
    ],
    "related": [
      "ports-and-adapters"
    ],
    "tags": [
      "cell-based",
      "fault-isolation",
      "multi-tenant",
      "scalability",
      "wso2",
      "cloud-native"
    ],
    "origin_author": "WSO2",
    "origin_source": "Siriwardena, P. & Fremantle, P. (2018). \"Cell-Based Architecture\". WSO2 White Paper. wso2.com/whitepapers/cell-based-architecture-and-reference-implementation.",
    "origin_source_zh": "Siriwardena, P. & Fremantle, P.（2018）。「单元化架构」。WSO2白皮书。wso2.com/whitepapers/cell-based-architecture-and-reference-implementation。",
    "complexity": "advanced",
    "when_to_use": [
      "In large-scale SaaS platforms serving thousands of tenants where a noisy-neighbor incident in one tenant should not degrade the experience of others",
      "When blast-radius reduction is a primary architectural goal — large distributed systems where a single deployment or failure can cascade across the entire platform",
      "For geographically distributed systems with data-residency requirements (GDPR, China data localization) where cells map to regions with isolated data stores",
      "When different customer segments require different SLAs, resource quotas, or compliance profiles that cannot be enforced in a shared architecture"
    ],
    "when_to_use_zh": [
      "在服务数千个租户的大型SaaS平台中，其中一个租户的嘈杂邻居事件不应降低其他租户的体验",
      "当减小爆炸半径是主要架构目标时——单次部署或故障可能级联到整个平台的大型分布式系统",
      "对于具有数据驻留要求（GDPR、中国数据本地化）的地理分布式系统，其中单元映射到具有隔离数据存储的区域",
      "当不同客户细分需要无法在共享架构中强制执行的不同SLA、资源配额或合规配置文件时"
    ],
    "core_concepts": [
      "Cell: The fundamental deployment unit — a self-contained cluster of services, data stores, and infrastructure that processes a defined subset of the overall traffic",
      "Cell Router: A stateless, highly available layer that routes requests to the appropriate cell based on cell-affinity criteria without knowledge of internal cell topology",
      "Blast Radius Containment: The property that a failure, security incident, or bad deployment within one cell cannot directly affect the operation of other cells",
      "Cell Affinity: The binding between a user, tenant, or request and a specific cell, maintained by the router and preserved across requests to ensure consistent data locality",
      "Cell Blueprint (Golden Path): A version-controlled infrastructure-as-code template defining the canonical structure of a cell, enabling fleet-wide upgrades through template versioning"
    ],
    "core_concepts_zh": [
      "单元：基本部署单元——一个自包含的服务、数据存储和基础设施集群，处理整体流量的定义子集",
      "单元路由器：无状态、高可用的层，根据单元亲和性标准将请求路由到适当的单元，而无需了解内部单元拓扑",
      "爆炸半径遏制：一个单元内的故障、安全事件或错误部署不能直接影响其他单元操作的属性",
      "单元亲和性：用户、租户或请求与特定单元之间的绑定，由路由器维护并在请求间保持，以确保一致的数据局部性",
      "单元蓝图（黄金路径）：定义单元规范结构的版本控制基础设施即代码模板，通过模板版本控制实现整个舰队的升级"
    ],
    "timeline": [
      [
        "2018",
        "WSO2 publishes the Cell-Based Architecture white paper, formalizing the pattern from lessons learned building large-scale API management platforms"
      ],
      [
        "2019",
        "Amazon publishes \"Avoiding Fallback in Distributed Systems\", describing shuffle sharding as a blast-radius reduction technique related to cell isolation"
      ],
      [
        "2021",
        "Slack and DoorDash publish engineering blog posts describing cell-based deployments for tenant isolation at scale"
      ],
      [
        "2023",
        "AWS Well-Architected Framework formally incorporates cell-based architecture as a reliability best practice for high-scale workloads"
      ]
    ],
    "timeline_zh": [
      [
        "2018",
        "WSO2发布单元化架构白皮书，从构建大规模API管理平台的经验教训中正式化该模式"
      ],
      [
        "2019",
        "亚马逊发布「避免分布式系统中的回退」，描述洗牌分片作为与单元隔离相关的爆炸半径缩减技术"
      ],
      [
        "2021",
        "Slack和DoorDash发布工程博客文章，描述用于大规模租户隔离的单元化部署"
      ],
      [
        "2023",
        "AWS良好架构框架正式将单元化架构纳入高规模工作负载的可靠性最佳实践"
      ]
    ],
    "dos": [
      "Design cells to be stateless at the routing layer — cell affinity state lives in the router or a shared cell-mapping service, never in cells themselves",
      "Version your cell blueprint separately from the services running inside cells, enabling you to upgrade the cell infrastructure (k8s version, observability agent) independently of application code",
      "Plan for cell rebalancing from the start — as tenant distribution changes over time, you need automated or semi-automated tooling to migrate tenants between cells without downtime",
      "Treat the cell boundary as a security boundary — network policies, IAM roles, and encryption keys should be scoped to individual cells to limit lateral movement in breach scenarios"
    ],
    "dos_zh": [
      "将单元设计为路由层无状态——单元亲和性状态存在于路由器或共享单元映射服务中，而非单元本身",
      "将单元蓝图与在单元内运行的服务分开版本控制，使你能够独立于应用程序代码升级单元基础设施（k8s版本、可观测性代理）",
      "从一开始就规划单元再平衡——随着时间推移租户分布的变化，你需要自动化或半自动化工具在不停机的情况下将租户迁移到不同单元",
      "将单元边界视为安全边界——网络策略、IAM角色和加密密钥应限定于单个单元，以在泄露场景中限制横向移动"
    ],
    "donts": [
      "Don't share databases or message queues across cells because cross-cell data dependencies break the blast-radius guarantee and create cascading failure paths",
      "Don't start with too many small cells — the operational overhead of managing dozens of cells before traffic warrants it consumes engineering capacity better spent elsewhere",
      "Don't build cells that communicate synchronously with each other during request processing because this reintroduces tight coupling and the distributed monolith failures cell-based architecture is designed to prevent",
      "Don't use cell-based architecture when your product has a single tenant or very few customers — the added complexity is not justified until multi-tenancy isolation becomes a real availability or compliance requirement"
    ],
    "donts_zh": [
      "不要跨单元共享数据库或消息队列，因为跨单元数据依赖会破坏爆炸半径保证并创建级联故障路径",
      "不要从太多小单元开始——在流量证明其合理之前管理数十个单元的运营开销会消耗本可更好利用的工程能力",
      "不要构建在请求处理期间相互同步通信的单元，因为这会重新引入紧耦合和单元化架构旨在防止的分布式单体故障",
      "当你的产品有单个租户或极少客户时不要使用单元化架构——在多租户隔离成为真实可用性或合规需求之前，增加的复杂性是不合理的"
    ],
    "case_study_company": "Slack",
    "case_study": "Slack adopted cell-based architecture (which they call \"sharding\") as their user base scaled past 10 million daily active users. Each cell (shard) contains a subset of Slack workspaces and owns all the services needed to serve those workspaces — message storage, real-time WebSocket handlers, search indexing, and notification delivery. When a viral incident caused one workspace to generate 100x normal traffic, the blast radius was limited to the cell containing that workspace, protecting the other 99% of users. Slack's engineering team reported that cell isolation enabled them to conduct disruptive infrastructure upgrades (Kubernetes migration, database engine changes) on individual cells during off-peak hours without maintenance windows affecting all users simultaneously.",
    "case_study_zh": "Slack在日活跃用户超过1000万后采用了单元化架构（他们称之为「分片」）。每个单元（分片）包含Slack工作区的子集，并拥有服务这些工作区所需的所有服务——消息存储、实时WebSocket处理程序、搜索索引和通知传递。当一个病毒式事件导致一个工作区产生100倍正常流量时，爆炸半径仅限于包含该工作区的单元，保护了其他99%的用户。Slack的工程团队报告称，单元隔离使他们能够在非高峰时段对单个单元进行破坏性基础设施升级（Kubernetes迁移、数据库引擎更改），而不会影响所有用户同时进入维护窗口。",
    "case_study_challenge": "As Slack scaled past 10 million daily active users, a single viral workspace could generate 100x normal traffic and threaten availability for the entire platform. Infrastructure upgrades required company-wide maintenance windows because every workspace shared the same service instances.",
    "case_study_challenge_zh": "当Slack的日活跃用户突破1000万时，一个病毒式传播的工作区可能产生100倍的正常流量，威胁整个平台的可用性。基础设施升级需要全公司范围的维护窗口，因为所有工作区共享相同的服务实例。",
    "case_study_approach": "Slack partitioned its infrastructure into independent cells (which they call shards), each owning a subset of workspaces and all supporting services — message storage, real-time WebSocket handlers, search indexing, and notification delivery. A routing layer maps each workspace to its cell, creating complete blast-radius isolation.",
    "case_study_approach_zh": "Slack将基础设施划分为独立的单元（他们称之为分片），每个单元拥有一部分工作区及其全部支撑服务——消息存储、实时WebSocket处理器、搜索索引和通知推送。路由层将每个工作区映射到其所属单元，实现完整的爆炸半径隔离。",
    "case_study_result": "When a viral incident hit one workspace, only the cell containing that workspace was affected — the other 99% of users experienced zero impact. Cell isolation also enabled rolling infrastructure upgrades (Kubernetes migration, database engine swaps) on individual cells during off-peak hours, eliminating platform-wide maintenance windows entirely.",
    "case_study_result_zh": "当病毒式事件冲击某个工作区时，只有包含该工作区的单元受到影响——其余99%的用户完全无感。单元隔离还使团队能够在非高峰时段对单个单元滚动执行基础设施升级（Kubernetes迁移、数据库引擎更换），彻底消除了全平台维护窗口。",
    "case_study_quote": "The moment we stopped thinking of our infrastructure as one thing and started thinking of it as many independent things, our availability story changed completely.",
    "case_study_quote_zh": "当我们不再将基础设施视为一个整体，而是看作许多独立个体的那一刻，我们的可用性叙事彻底改变了。",
    "when_not_to_use": [
      "For single-tenant applications or platforms with fewer than a few hundred customers where the operational complexity of managing a cell fleet exceeds any resilience benefit",
      "When your data model requires frequent cross-tenant joins or aggregations that cannot be pre-computed — cell isolation forces data locality that makes cross-cell queries expensive or impossible",
      "Early-stage products where the architecture should optimize for rapid iteration speed rather than operational resilience at scale"
    ],
    "when_not_to_use_zh": [
      "对于单租户应用程序或客户少于几百个的平台，管理单元舰队的运营复杂性超过任何弹性收益",
      "当你的数据模型需要频繁跨租户联接或无法预计算的聚合时——单元隔离强制数据局部性，使跨单元查询变得昂贵或不可能",
      "早期阶段的产品，架构应优化快速迭代速度而非大规模运营弹性"
    ],
    "adopters": [
      "Slack",
      "DoorDash",
      "WSO2",
      "Amazon Web Services",
      "Zalando",
      "Shopify"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability",
      "security"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Siriwardena, P. & Fremantle, P. (2018). \"Cell-Based Architecture\". WSO2. wso2.com/whitepapers/cell-based-architecture-and-reference-implementation.",
    "secondary_sources": [
      "Amazon (2019). \"Avoiding Fallback in Distributed Systems\". amazon.science.",
      "AWS (2023). \"Cell-Based Architecture\". AWS Well-Architected Framework. docs.aws.amazon.com/wellarchitected.",
      "Nygard, M. (2018). \"Release It! Design and Deploy Production-Ready Software\", 2nd ed. Pragmatic Bookshelf."
    ],
    "typed_relations": [
      {
        "slug": "ports-and-adapters",
        "type": "related"
      }
    ]
  },
  {
    "id": 262,
    "name": "Ports and Adapters (Hexagonal Architecture)",
    "name_zh": "端口与适配器（六边形架构）",
    "slug": "ports-and-adapters",
    "category": "architecture",
    "desc": "Alistair Cockburn's architectural pattern that isolates the application core from external technology concerns by defining explicit ports (interfaces) and adapters (technology-specific implementations)",
    "desc_zh": "Alistair Cockburn的架构模式，通过定义明确的端口（接口）和适配器（特定技术实现），将应用核心与外部技术关注点隔离",
    "steps": [
      "Define the Application Core: Identify and isolate the domain logic and use cases into a technology-agnostic inner hexagon that contains no references to databases, HTTP frameworks, or external services",
      "Define Driving Ports: Specify inbound interfaces (use case interfaces, command handlers) that represent the ways the outside world drives the application — these are the left-side ports of the hexagon",
      "Define Driven Ports: Specify outbound interfaces (repository interfaces, notification services, messaging ports) that represent how the application drives external dependencies — right-side ports",
      "Implement Adapters: Write concrete implementations for each port — a REST controller adapter drives via the HTTP port, a JPA repository adapter implements the persistence port — keeping adapters thin and free of business logic",
      "Wire with Dependency Injection: Compose the application by injecting appropriate adapters into the core at startup time, enabling easy swapping of adapters for testing (in-memory adapters) or technology migration (switching databases)"
    ],
    "steps_zh": [
      "定义应用核心：将领域逻辑和用例识别并隔离到一个技术无关的内部六边形中，该六边形不包含对数据库、HTTP框架或外部服务的引用",
      "定义驱动端口：指定入站接口（用例接口、命令处理器），表示外部世界驱动应用程序的方式——这些是六边形的左侧端口",
      "定义被驱动端口：指定出站接口（仓储接口、通知服务、消息端口），表示应用程序驱动外部依赖的方式——右侧端口",
      "实现适配器：为每个端口编写具体实现——REST控制器适配器通过HTTP端口驱动，JPA仓储适配器实现持久化端口——保持适配器精简且不含业务逻辑",
      "使用依赖注入连接：通过在启动时将适当的适配器注入核心来组合应用程序，实现轻松交换适配器用于测试（内存适配器）或技术迁移（更换数据库）"
    ],
    "ai_relevant": false,
    "viz_type": "venn",
    "viz_labels": [
      "Core Domain",
      "Port",
      "Adapter"
    ],
    "viz_labels_zh": [
      "核心领域",
      "端口",
      "适配器"
    ],
    "related": [
      "domain-driven-design",
      "cell-based-architecture",
      "clean-architecture",
      "hexagonal-architecture",
      "onion-architecture",
      "n-tier-layered"
    ],
    "tags": [
      "hexagonal",
      "ports-adapters",
      "clean-architecture",
      "testability",
      "cockburn"
    ],
    "origin_author": "Alistair Cockburn",
    "origin_source": "Cockburn, A. (2005). \"Hexagonal Architecture\". alistair.cockburn.us/hexagonal-architecture.",
    "origin_source_zh": "Cockburn, A.（2005）。「六边形架构」。alistair.cockburn.us/hexagonal-architecture。",
    "complexity": "intermediate",
    "when_to_use": [
      "When the application needs to be driven from multiple clients (HTTP API, CLI, message queue consumer, test harness) without duplicating business logic across each entry point",
      "For applications with domain logic that must be thoroughly unit-tested in isolation from databases, external APIs, and messaging infrastructure",
      "When the technology stack is likely to evolve (migrating from REST to gRPC, switching databases) and the business logic must remain stable during infrastructure changes",
      "In DDD-oriented projects where protecting the domain model from infrastructure concerns is a first-class design objective"
    ],
    "when_to_use_zh": [
      "当应用程序需要由多个客户端（HTTP API、CLI、消息队列消费者、测试工具）驱动而不在每个入口点重复业务逻辑时",
      "对于必须在隔离数据库、外部API和消息基础设施的情况下彻底单元测试领域逻辑的应用程序",
      "当技术栈可能演变（从REST迁移到gRPC、更换数据库）且业务逻辑必须在基础设施变更期间保持稳定时",
      "在以DDD为导向的项目中，保护领域模型免受基础设施关注点的影响是一等设计目标"
    ],
    "core_concepts": [
      "Port: A formally defined interface representing an interaction point between the application core and the outside world; driving ports are inbound (triggered by external actors), driven ports are outbound (triggered by the application)",
      "Adapter: A concrete implementation of a port that translates between the application's domain language and the specific technology protocol (HTTP, SQL, AMQP, gRPC)",
      "Application Core (Hexagon): The technology-agnostic center containing domain objects, use case orchestration, and business rules; it has zero knowledge of adapter implementations",
      "Primary/Driving Side: Actors that initiate interactions — users, API consumers, scheduled jobs — connected to the application through driving adapters and ports",
      "Secondary/Driven Side: Actors that the application drives — databases, external APIs, email services — connected through driven ports and adapters that the application core defines"
    ],
    "core_concepts_zh": [
      "端口：代表应用核心与外部世界交互点的正式定义接口；驱动端口是入站的（由外部参与者触发），被驱动端口是出站的（由应用程序触发）",
      "适配器：端口的具体实现，在应用程序的领域语言和特定技术协议（HTTP、SQL、AMQP、gRPC）之间进行转换",
      "应用核心（六边形）：包含领域对象、用例编排和业务规则的技术无关中心；它对适配器实现毫不知情",
      "主要/驱动侧：发起交互的参与者——用户、API消费者、定时任务——通过驱动适配器和端口连接到应用程序",
      "次要/被驱动侧：应用程序驱动的参与者——数据库、外部API、电子邮件服务——通过应用核心定义的被驱动端口和适配器连接"
    ],
    "timeline": [
      [
        "2005",
        "Alistair Cockburn publishes \"Hexagonal Architecture\" on his personal website, coining the term Ports and Adapters"
      ],
      [
        "2012",
        "Robert C. Martin proposes Clean Architecture, which explicitly credits Ports and Adapters as one of its foundational inspirations alongside Onion Architecture"
      ],
      [
        "2013",
        "The pattern gains wide adoption in the JVM ecosystem through frameworks like Spring Boot and Axon Framework that support hexagonal structuring"
      ],
      [
        "2017",
        "The DDD community standardizes Ports and Adapters as the canonical infrastructure boundary pattern in Domain-Driven Design implementations"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "Alistair Cockburn在其个人网站上发布「六边形架构」，创造了「端口与适配器」一词"
      ],
      [
        "2012",
        "Robert C. Martin提出整洁架构，明确将端口与适配器与洋葱架构一起列为其基础灵感之一"
      ],
      [
        "2013",
        "该模式通过支持六边形结构的Spring Boot和Axon Framework等框架在JVM生态系统中获得广泛采用"
      ],
      [
        "2017",
        "DDD社区将端口与适配器标准化为领域驱动设计实现中的规范基础设施边界模式"
      ]
    ],
    "dos": [
      "Define port interfaces in the domain layer (inner hexagon) and have adapters depend on those interfaces — the dependency arrow always points inward",
      "Keep adapters thin and free of business logic — an adapter should do nothing more than translate, marshal, and delegate; all decisions live in the core",
      "Write unit tests against the application core using in-memory adapters — this is the primary payoff of the pattern, enabling fast, reliable tests with no I/O",
      "Name ports by their business intent (OrderRepository, PaymentGateway) rather than by technology (PostgresRepository, StripeClient) to preserve technology independence"
    ],
    "dos_zh": [
      "在领域层（内部六边形）中定义端口接口，让适配器依赖这些接口——依赖箭头始终指向内部",
      "保持适配器精简且不含业务逻辑——适配器只应进行转换、编组和委托；所有决策都在核心中",
      "使用内存适配器对应用核心编写单元测试——这是该模式的主要回报，实现无I/O的快速可靠测试",
      "按业务意图（OrderRepository、PaymentGateway）而非技术（PostgresRepository、StripeClient）命名端口，以保持技术独立性"
    ],
    "donts": [
      "Don't let domain objects leak infrastructure types (JPA annotations, ORM entity markers) — this couples the core to specific technologies and defeats the isolation goal",
      "Don't create one adapter per class — adapters are boundary implementations, not a one-to-one mapping with every service or repository; group by technology context",
      "Don't over-apply the pattern to simple CRUD services where the business logic is trivial and the ceremony of ports/adapters adds structural complexity with no isolation benefit",
      "Don't confuse Ports and Adapters with layered (n-tier) architecture — the key difference is the inversion of dependency direction; in layers, business logic depends on data access; in hexagonal, data access depends on business-defined ports"
    ],
    "donts_zh": [
      "不要让领域对象泄漏基础设施类型（JPA注解、ORM实体标记）——这会将核心耦合到特定技术，破坏隔离目标",
      "不要为每个类创建一个适配器——适配器是边界实现，而非与每个服务或仓储的一对一映射；按技术上下文分组",
      "不要过度将该模式应用于简单CRUD服务，其中业务逻辑微不足道，端口/适配器的形式主义增加结构复杂性而无隔离收益",
      "不要将端口与适配器与分层（n层）架构混淆——关键区别在于依赖方向的反转；在分层中，业务逻辑依赖数据访问；在六边形中，数据访问依赖业务定义的端口"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix applied Hexagonal Architecture principles to their recommendation engine infrastructure, a system that must be driven by multiple clients (A/B test harness, real-time API calls, batch precomputation jobs) and must integrate with multiple downstream systems (feature store, serving layer, experiment platform). By defining explicit driving ports (RecommendationUseCase interface) and driven ports (FeatureStore, ModelRegistry, ServingLayer), the recommendation team was able to swap the underlying ML model serving infrastructure from their own homegrown system to TorchServe without touching the core recommendation logic. The in-memory adapter pattern enabled them to run the full recommendation logic in unit tests 200x faster than integration tests against real infrastructure, accelerating their experimentation cycle.",
    "case_study_zh": "Netflix将六边形架构原则应用于其推荐引擎基础设施，该系统必须由多个客户端（A/B测试工具、实时API调用、批量预计算作业）驱动，并必须与多个下游系统（特征存储、服务层、实验平台）集成。通过定义明确的驱动端口（RecommendationUseCase接口）和被驱动端口（FeatureStore、ModelRegistry、ServingLayer），推荐团队能够将底层ML模型服务基础设施从自研系统切换到TorchServe，而无需触及核心推荐逻辑。内存适配器模式使他们能够在单元测试中运行完整推荐逻辑，速度比针对真实基础设施的集成测试快200倍，加速了实验周期。",
    "when_not_to_use": [
      "For simple CRUD microservices where the domain logic is essentially the database schema — the overhead of ports, adapters, and dependency injection configuration adds friction without testability or flexibility benefit",
      "In event-sourcing architectures where the persistence model fundamentally shapes the domain model — the forced separation can work against the grain of event-sourced design",
      "For extremely performance-sensitive hot paths where the indirection layers of interface dispatch and adapter translation add measurable overhead in latency-critical code"
    ],
    "when_not_to_use_zh": [
      "对于简单CRUD微服务，领域逻辑本质上就是数据库模式——端口、适配器和依赖注入配置的开销增加了摩擦，而没有可测试性或灵活性收益",
      "在事件溯源架构中，持久化模型从根本上塑造领域模型——强制分离可能与事件溯源设计的自然方向相悖",
      "对于极其性能敏感的热路径，接口分发和适配器转换的间接层在延迟关键代码中增加可测量的开销"
    ],
    "adopters": [
      "Netflix",
      "Spotify",
      "Zalando",
      "Sky UK",
      "Thoughtworks",
      "BBVA"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "Cockburn, A. (2005). \"Hexagonal Architecture\". alistair.cockburn.us/hexagonal-architecture.",
    "secondary_sources": [
      "Martin, R. C. (2017). \"Clean Architecture: A Craftsman's Guide to Software Structure and Design\". Prentice Hall.",
      "Vernon, V. (2013). \"Implementing Domain-Driven Design\". Addison-Wesley.",
      "Hombergs, T. (2019). \"Get Your Hands Dirty on Clean Architecture\". Leanpub."
    ],
    "typed_relations": [
      {
        "slug": "domain-driven-design",
        "type": "complement"
      },
      {
        "slug": "cell-based-architecture",
        "type": "related"
      },
      {
        "slug": "clean-architecture",
        "type": "alternative"
      },
      {
        "slug": "hexagonal-architecture",
        "type": "alternative"
      },
      {
        "slug": "onion-architecture",
        "type": "alternative"
      },
      {
        "slug": "n-tier-layered",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 263,
    "name": "Evolutionary Architecture",
    "name_zh": "演进式架构",
    "slug": "evolutionary-architecture",
    "category": "architecture",
    "desc": "Neal Ford's approach to designing software systems that support incremental, guided change across all dimensions through fitness functions and architectural coupling analysis",
    "desc_zh": "Neal Ford提出的软件系统设计方法，通过适应度函数和架构耦合分析支持在所有维度上的增量、有指导的变化",
    "steps": [
      "Define Fitness Functions: Identify measurable criteria (deployment frequency, coupling metrics, test coverage, performance SLAs, security scan thresholds) that define what「fit」means for your architecture, and automate their evaluation in CI/CD",
      "Identify Architectural Coupling Dimensions: Map afferent and efferent coupling across modules, measure cyclomatic complexity, identify inappropriate coupling between business domains that resist evolutionary change",
      "Establish Incremental Change Mechanisms: Implement techniques enabling safe incremental change — feature flags, strangler fig pattern, branch by abstraction, parallel runs — that allow evolution without big-bang rewrites",
      "Guided Change with Fitness Functions: As features are added or architecture evolves, fitness functions serve as automated guardrails — a failed fitness function in CI is an architectural regression, not just a test failure",
      "Continuous Architecture Review: Replace periodic architecture review boards with continuous automated fitness function monitoring and lightweight architecture decision records (ADRs) that capture the 「why」behind each evolutionary step"
    ],
    "steps_zh": [
      "定义适应度函数：识别定义架构「适合」含义的可测量标准（部署频率、耦合指标、测试覆盖率、性能SLA、安全扫描阈值），并在CI/CD中自动化评估",
      "识别架构耦合维度：映射模块间的传入和传出耦合，测量圈复杂度，识别阻碍演进变化的业务领域间不当耦合",
      "建立增量变更机制：实施支持安全增量变化的技术——特性标志、绞杀者模式、通过抽象分支、并行运行——允许演进而无需大爆炸式重写",
      "用适应度函数指导变更：随着功能添加或架构演进，适应度函数充当自动化护栏——CI中失败的适应度函数是架构退化，而非仅仅是测试失败",
      "持续架构审查：用持续的自动化适应度函数监控和轻量级架构决策记录（ADR）取代定期架构审查委员会，捕获每个演进步骤背后的「原因」"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Fitness Function",
      "Incremental Change",
      "Coupling",
      "Versioning"
    ],
    "viz_labels_zh": [
      "适应度函数",
      "增量变更",
      "架构耦合",
      "版本管理"
    ],
    "related": [
      "domain-driven-design",
      "ports-and-adapters"
    ],
    "tags": [
      "evolutionary",
      "fitness-function",
      "continuous-architecture",
      "neal-ford",
      "thoughtworks"
    ],
    "origin_author": "Neal Ford",
    "origin_source": "Ford, N., Parsons, R. & Kua, P. (2017). \"Building Evolutionary Architectures\". O'Reilly Media.",
    "origin_source_zh": "Ford, N., Parsons, R. & Kua, P.（2017）。「构建演进式架构」。O'Reilly Media。",
    "complexity": "advanced",
    "when_to_use": [
      "For long-lived systems (5+ years) where the requirements, technology landscape, and organizational structure will inevitably change in ways that cannot be fully anticipated at design time",
      "In organizations practicing continuous delivery where the architecture must accommodate frequent, small-batch changes without accumulating structural debt",
      "When transitioning from monolith to microservices and needing structured, measured migration paths rather than speculative big-bang decomposition",
      "For platform engineering teams building internal developer platforms where fitness functions codify architectural standards that all consuming teams must meet"
    ],
    "when_to_use_zh": [
      "对于长寿命系统（5年以上），需求、技术格局和组织结构将不可避免地以设计时无法完全预料的方式变化",
      "在实践持续交付的组织中，架构必须适应频繁的小批量变更而不积累结构性债务",
      "在从单体迁移到微服务时，需要结构化、可测量的迁移路径，而非推测性的大爆炸式分解",
      "对于构建内部开发平台的平台工程团队，适应度函数编码了所有消费团队必须满足的架构标准"
    ],
    "core_concepts": [
      "Fitness Function: An objective, automated metric that evaluates one architectural characteristic (performance, security, coupling, deployability) — analogous to a genetic algorithm's fitness function, determining whether a change improves or degrades architecture",
      "Architectural Coupling: The degree to which components are bound together in ways that prevent independent evolution — high afferent coupling (many dependents) and inappropriate cross-domain coupling are primary evolutionary obstacles",
      "Incremental Change: The principle that architectural change should happen in small, validated steps with each step evaluated against fitness functions, preventing the accumulation of unvalidated architectural debt",
      "Cyclic Dependencies: Dependency graphs where A depends on B which depends on A, the most destructive coupling pattern for evolutionary architectures because it forces coordinated deployment and prevents independent evolution",
      "Architecture Quantum: The smallest independently deployable unit that includes all the structural elements required for the system to function — identifying quanta reveals the natural boundaries for evolutionary decomposition"
    ],
    "core_concepts_zh": [
      "适应度函数：评估一个架构特征（性能、安全、耦合、可部署性）的客观、自动化指标——类似于遗传算法的适应度函数，用于判断一项变更是改善还是劣化了架构",
      "架构耦合：组件以阻止独立演进的方式绑定在一起的程度——高传入耦合（多个依赖方）和不当的跨领域耦合是主要的演进障碍",
      "增量变更：架构变更应以小的、经验证的步骤发生，每步根据适应度函数评估，防止未经验证的架构债务积累",
      "循环依赖：A依赖B而B又依赖A的依赖图，是演进式架构最具破坏性的耦合模式，因为它强制协调部署并阻止独立演进",
      "架构量子：包含系统运行所需所有结构元素的最小可独立部署单元——识别量子揭示了演进式分解的自然边界"
    ],
    "timeline": [
      [
        "2015",
        "Neal Ford and Rebecca Parsons first present the Evolutionary Architecture concept and fitness function idea at O'Reilly Software Architecture Conference"
      ],
      [
        "2017",
        "Ford, Parsons, and Kua publish「Building Evolutionary Architectures」, providing the first comprehensive treatment of the approach"
      ],
      [
        "2020",
        "Thoughtworks Technology Radar promotes fitness functions to「Adopt」, reflecting widespread industry uptake of the automated architecture governance concept"
      ],
      [
        "2022",
        "Second edition of「Building Evolutionary Architectures」published with expanded coverage of microservices fitness functions and platform engineering integration"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Neal Ford和Rebecca Parsons在O'Reilly软件架构大会上首次提出演进式架构概念和适应度函数思想"
      ],
      [
        "2017",
        "Ford、Parsons和Kua出版「构建演进式架构」，提供了该方法的第一个全面论述"
      ],
      [
        "2020",
        "Thoughtworks技术雷达将适应度函数提升到「采用」级别，反映了自动化架构治理概念的广泛行业采用"
      ],
      [
        "2022",
        "「构建演进式架构」第二版出版，扩展了微服务适应度函数和平台工程集成的内容"
      ]
    ],
    "dos": [
      "Start fitness functions from existing quality attributes — if you already measure deployment frequency, test coverage, and p99 latency, those are your first fitness functions with minimal new instrumentation",
      "Make fitness functions executable in CI so architectural degradation is caught at the same point as functional regressions — treat an architecture fitness failure as a build-breaking defect",
      "Use coupling visualization tools (Structure101, JDepend, dependency-cruiser) to make the current coupling state visible before planning evolutionary steps",
      "Combine fitness functions with Architecture Decision Records (ADRs) — fitness functions enforce the decision; ADRs explain why the decision was made and what tradeoffs were accepted"
    ],
    "dos_zh": [
      "从现有质量属性开始适应度函数——如果你已经测量部署频率、测试覆盖率和p99延迟，这些就是你的第一批适应度函数，需要最少的新仪表化",
      "使适应度函数可在CI中执行，以便在捕获功能回归的同一环节发现架构退化——将架构适应度失败视为会中断构建的缺陷",
      "使用耦合可视化工具（Structure101、JDepend、dependency-cruiser）在规划演进步骤之前使当前耦合状态可见",
      "将适应度函数与架构决策记录（ADR）结合——适应度函数执行决策；ADR解释为什么做出决策以及接受了哪些权衡"
    ],
    "donts": [
      "Don't design for imagined future requirements — evolutionary architecture means designing for changeability, not speculating about specific changes and over-engineering for them",
      "Don't treat fitness functions as a bureaucratic compliance checklist — each fitness function should protect a concrete architectural property that has caused or could cause a real business problem",
      "Don't attempt a full architecture migration in a single large project — evolutionary architecture demands incremental steps; a multi-year 「we'll rewrite everything」 plan is the antithesis of evolutionary thinking",
      "Don't neglect the organizational dimension — Conway's Law means team structure determines architecture; evolving the architecture without evolving the team topology creates an uphill battle against communication-driven coupling"
    ],
    "donts_zh": [
      "不要为想象中的未来需求进行设计——演进式架构意味着为可变性设计，而非推测特定变化并为其过度设计",
      "不要将适应度函数视为官僚式的合规检查清单——每个适应度函数都应保护一个具体的架构属性，该属性已经或可能导致真实的业务问题",
      "不要试图在单个大型项目中完成完整的架构迁移——演进式架构要求增量步骤；多年「我们将重写所有内容」的计划是演进思想的对立面",
      "不要忽视组织维度——康威定律意味着团队结构决定架构；在不演进团队拓扑的情况下演进架构，无异于与沟通驱动的耦合打一场艰难的逆势之战"
    ],
    "case_study_company": "Thoughtworks",
    "case_study": "Thoughtworks applied evolutionary architecture principles to a large European bank's core banking platform migration — a 15-year-old monolith serving 8 million customers. Rather than a big-bang rewrite, the team defined fitness functions covering: deployment frequency (target: daily from quarterly), test coverage (target: >80% from 23%), module coupling (ArchUnit rules preventing cross-domain imports), and performance regression detection (p95 latency gates in the pipeline). The strangler fig pattern incrementally extracted 12 bounded contexts over 18 months. Each extracted service had its own fitness function suite. By month 12, deployment frequency had increased from quarterly to weekly, with daily deployments achieved by month 18. The fitness functions prevented three architectural regressions that would have reintroduced cross-domain coupling in the new microservices.",
    "case_study_zh": "Thoughtworks将演进式架构原则应用于一家大型欧洲银行的核心银行平台迁移——一个为800万客户服务的15年单体系统。团队没有进行大爆炸式重写，而是定义了覆盖以下方面的适应度函数：部署频率（目标：从季度变为每日）、测试覆盖率（目标：从23%提高到>80%）、模块耦合（防止跨领域导入的ArchUnit规则）和性能退化检测（流水线中的p95延迟门控）。绞杀者模式在18个月内逐步提取了12个限界上下文。每个提取的服务都有自己的适应度函数套件。到第12个月，部署频率从季度增加到每周，到第18个月实现了每日部署。适应度函数防止了三次本会在新微服务中重新引入跨领域耦合的架构退化。",
    "when_not_to_use": [
      "For short-lived throwaway systems (prototypes, event-specific apps) where the investment in fitness function infrastructure exceeds the system's expected lifespan",
      "When the primary architectural constraint is raw performance rather than changeability — highly optimized systems often require tight coupling by design and resist the modularity that enables evolution",
      "For very small teams (2-3 developers) where the overhead of defining, instrumenting, and maintaining fitness functions is disproportionate to the coordination benefit they provide"
    ],
    "when_not_to_use_zh": [
      "对于短命的一次性系统（原型、活动特定应用程序），适应度函数基础设施的投资超过系统的预期寿命",
      "当主要架构约束是原始性能而非可变性时——高度优化的系统通常设计上需要紧耦合，并抵制支持演进的模块化",
      "对于非常小的团队（2-3名开发人员），定义、仪表化和维护适应度函数的开销与它们提供的协调收益不成比例"
    ],
    "adopters": [
      "Thoughtworks",
      "Netflix",
      "Spotify",
      "Etsy",
      "AutoTrader UK",
      "HSBC"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "scalability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Ford, N., Parsons, R. & Kua, P. (2017). \"Building Evolutionary Architectures\". O'Reilly Media.",
    "secondary_sources": [
      "Ford, N., Parsons, R., Kua, P. & Sadalage, P. (2022). \"Building Evolutionary Architectures\", 2nd ed. O'Reilly Media.",
      "Thoughtworks (2020). \"Fitness Functions\". Technology Radar. thoughtworks.com/radar.",
      "Newman, S. (2019). \"Monolith to Microservices\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "domain-driven-design",
        "type": "complement"
      },
      {
        "slug": "ports-and-adapters",
        "type": "complement"
      }
    ]
  },
  {
    "id": 316,
    "name": "MVC (Model-View-Controller)",
    "name_zh": "MVC（模型-视图-控制器）",
    "slug": "mvc",
    "category": "architecture",
    "desc": "Separates an application into three interconnected components — Model (data/logic), View (UI), and Controller (input handling) — to decouple presentation from business logic.",
    "desc_zh": "将应用程序分为三个相互关联的组件——模型（数据/逻辑）、视图（界面）和控制器（输入处理）——以解耦表示层与业务逻辑。",
    "steps": [
      "Define the Model layer: data structures, business rules, and persistence logic; the Model notifies observers when its state changes",
      "Define the View layer: all UI rendering and presentation logic, subscribing to Model changes to update the display without containing business logic",
      "Define the Controller layer: intercepts user input, translates it into commands for the Model or View, and orchestrates the interaction flow",
      "Wire the triad together: Controllers reference the Model and View; Views observe the Model; Models have no knowledge of Views or Controllers",
      "Validate separation by ensuring business logic changes require only Model modifications and UI redesigns require only View modifications"
    ],
    "steps_zh": [
      "定义模型层：数据结构、业务规则和持久化逻辑；模型在状态变化时通知观察者",
      "定义视图层：所有UI渲染和表示逻辑，订阅模型变化以更新显示，不包含业务逻辑",
      "定义控制器层：拦截用户输入，将其转化为对模型或视图的命令，并协调交互流程",
      "连接三元组：控制器引用模型和视图；视图观察模型；模型不了解视图或控制器",
      "通过确保业务逻辑变更只需修改模型、UI重设计只需修改视图来验证分离是否彻底"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Model",
      "View",
      "Controller",
      "User Input",
      "Update"
    ],
    "viz_labels_zh": [
      "模型",
      "视图",
      "控制器",
      "用户输入",
      "状态更新"
    ],
    "related": [
      "separation-of-concerns",
      "observer-pattern",
      "solid-principles",
      "mvvm",
      "mvp",
      "flux-unidirectional"
    ],
    "tags": [
      "ui-pattern",
      "separation-of-concerns",
      "component-architecture",
      "presentation-pattern"
    ],
    "origin_author": "Trygve Reenskaug, 1979, Xerox PARC",
    "origin_source": "Reenskaug, T. (1979). Thing-Model-View-Editor — an example from a planningsystem. Xerox PARC technical note.",
    "origin_source_zh": "Reenskaug, T.（1979）。《Thing-Model-View-Editor——规划系统示例》。Xerox PARC技术备忘录。",
    "complexity": "beginner",
    "when_to_use": [
      "When building applications with rich user interfaces that need clean separation between business logic and presentation",
      "When the same underlying data must be rendered in multiple different views simultaneously",
      "When working within frameworks that enforce or encourage MVC structure (Rails, Spring MVC, ASP.NET MVC, Django)",
      "When teams need to work in parallel on UI and backend logic with minimal conflicts"
    ],
    "when_to_use_zh": [
      "构建具有复杂用户界面且需要将业务逻辑与表示层清晰分离的应用程序时",
      "同一底层数据需要同时以多种不同视图呈现时",
      "在强制或鼓励MVC结构的框架（Rails、Spring MVC、ASP.NET MVC、Django）中工作时",
      "团队需要并行开发UI和后端逻辑且冲突最小化时"
    ],
    "core_concepts": [
      "Model: encapsulates application data, business rules, and state; completely independent of the UI layer",
      "View: renders the Model's state visually; ideally contains no logic beyond formatting and display",
      "Controller: mediates between user input and the Model; translates raw events into domain actions",
      "Observer relationship: Views subscribe to Model changes via the Observer pattern to stay synchronized without tight coupling",
      "Unidirectional dependency: Views and Controllers depend on the Model, but the Model depends on neither"
    ],
    "core_concepts_zh": [
      "模型：封装应用程序数据、业务规则和状态，完全独立于UI层",
      "视图：以可视方式呈现模型状态；理想情况下除格式化和显示外不包含任何逻辑",
      "控制器：在用户输入和模型之间进行调解；将原始事件转化为领域操作",
      "观察者关系：视图通过观察者模式订阅模型变化，无需紧耦合即可保持同步",
      "单向依赖：视图和控制器依赖模型，但模型不依赖任何一方"
    ],
    "timeline": [
      [
        "1979",
        "Trygve Reenskaug introduces Model-View-Controller at Xerox PARC while working on Smalltalk-76"
      ],
      [
        "1988",
        "MVC is formalized and published in the Smalltalk-80 system as described by Krasner and Pope"
      ],
      [
        "2004",
        "Ruby on Rails popularizes MVC for web applications, triggering widespread framework adoption"
      ],
      [
        "2009",
        "MVC dominates web frameworks — Spring MVC, ASP.NET MVC, and Django all establish MVC as the standard web pattern"
      ]
    ],
    "timeline_zh": [
      [
        "1979",
        "Trygve Reenskaug在Xerox PARC研究Smalltalk-76期间引入模型-视图-控制器"
      ],
      [
        "1988",
        "MVC在Smalltalk-80系统中由Krasner和Pope正式发表"
      ],
      [
        "2004",
        "Ruby on Rails将MVC普及用于Web应用程序，引发广泛的框架采用浪潮"
      ],
      [
        "2009",
        "MVC主导Web框架——Spring MVC、ASP.NET MVC和Django均将MVC确立为标准Web模式"
      ]
    ],
    "dos": [
      "Keep the Model completely ignorant of the View and Controller so it can be tested in isolation without any UI scaffolding",
      "Place all business validation in the Model, not in Controllers or Views, so that validation is enforced regardless of the entry point",
      "Use thin Controllers that delegate complex logic to the Model or service layer — fat controllers are a sign that business logic has leaked into the wrong layer",
      "Allow multiple Views to observe the same Model so data changes propagate consistently without duplication"
    ],
    "dos_zh": [
      "保持模型完全不了解视图和控制器，使其可以在没有任何UI脚手架的情况下独立测试",
      "将所有业务验证放在模型中而非控制器或视图中，确保无论从哪个入口点都强制执行验证",
      "使用精简的控制器，将复杂逻辑委托给模型或服务层——臃肿的控制器表明业务逻辑已泄漏到错误的层",
      "允许多个视图观察同一模型，使数据变化能够一致传播而不重复"
    ],
    "donts": [
      "Don't put database queries or business rules directly in Controllers — this creates untestable, framework-coupled logic",
      "Don't let Views directly manipulate Model state — all mutations must flow through the Controller",
      "Don't share mutable state between Views without going through the Model — direct View-to-View communication breaks the pattern",
      "Don't confuse MVC's Controller with the Front Controller pattern — MVC Controllers handle specific user interactions, not all HTTP routing"
    ],
    "donts_zh": [
      "不要将数据库查询或业务规则直接放在控制器中——这会产生无法测试、与框架耦合的逻辑",
      "不要让视图直接操纵模型状态——所有变更必须通过控制器流转",
      "不要在不经过模型的情况下在视图之间共享可变状态——视图间直接通信会破坏该模式",
      "不要将MVC的控制器与前端控制器模式混淆——MVC控制器处理特定的用户交互，而非所有HTTP路由"
    ],
    "case_study_company": "Apple",
    "case_study": "Apple's Cocoa framework for macOS and iOS enforces MVC as its primary application architecture pattern. In Cocoa, NSDocument (Model), NSView (View), and NSViewController (Controller) form the triad. By enforcing strict MVC separation, Apple enabled developers to swap entire UI implementations (e.g., migrating from AppKit to SwiftUI views) without touching model layer code. The iOS UIKit framework's adoption of MVC enabled a thriving ecosystem of third-party apps because developers could share Model code between iPhone and iPad view hierarchies with minimal changes.",
    "case_study_zh": "苹果的Cocoa框架将MVC作为macOS和iOS的主要应用架构模式强制推行。在Cocoa中，NSDocument（模型）、NSView（视图）和NSViewController（控制器）构成三元组。通过强制执行严格的MVC分离，苹果使开发者能够在不修改模型层代码的情况下替换整个UI实现（例如从AppKit视图迁移到SwiftUI视图）。iOS UIKit框架对MVC的采用催生了繁荣的第三方应用生态系统，因为开发者只需少量修改即可在iPhone和iPad视图层次结构之间共享模型代码。",
    "when_not_to_use": [
      "For simple scripts or single-screen utilities where the overhead of three layers adds complexity without benefit",
      "When building highly reactive UIs with complex two-way data binding, where MVVM provides a more natural fit",
      "For purely data-processing backend services with no user interface, where layered or hexagonal architecture is more appropriate"
    ],
    "when_not_to_use_zh": [
      "对于简单脚本或单屏幕工具，三层结构的开销只会增加复杂性而无收益",
      "构建具有复杂双向数据绑定的高度响应式UI时，MVVM提供更自然的契合",
      "对于没有用户界面的纯数据处理后端服务，分层架构或六边形架构更为合适"
    ],
    "adopters": [
      "Apple (Cocoa / UIKit)",
      "Ruby on Rails",
      "Spring MVC",
      "ASP.NET MVC",
      "Django"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Reenskaug, T. (1979). Models-Views-Controllers. Xerox PARC technical note, December 10.",
    "secondary_sources": [
      "Krasner, G. & Pope, S. (1988). A Description of the Model-View-Controller User Interface Paradigm in the Smalltalk-80 System. Journal of Object-Oriented Programming, 1(3), 26–49.",
      "Fowler, M. (2006). GUI Architectures. martinfowler.com.",
      "Apple Inc. (2009). Cocoa Application Layer — MVC. Apple Developer Documentation."
    ],
    "typed_relations": [
      {
        "slug": "separation-of-concerns",
        "type": "extends"
      },
      {
        "slug": "observer-pattern",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "mvvm",
        "type": "alternative"
      },
      {
        "slug": "mvp",
        "type": "alternative"
      },
      {
        "slug": "flux-unidirectional",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 317,
    "name": "MVVM (Model-View-ViewModel)",
    "name_zh": "MVVM（模型-视图-视图模型）",
    "slug": "mvvm",
    "category": "architecture",
    "desc": "Separates UI from business logic by introducing a ViewModel that exposes data streams and commands for two-way data binding, enabling declarative view construction and high testability.",
    "desc_zh": "通过引入ViewModel将UI与业务逻辑分离，ViewModel暴露数据流和命令用于双向数据绑定，从而实现声明式视图构建和高度可测试性。",
    "steps": [
      "Define the Model layer: domain entities, repositories, and business logic with no awareness of the UI",
      "Define the ViewModel: a presentation-layer class that exposes observable properties and commands; it queries the Model and transforms data into a format the View can bind to directly",
      "Define the View: a declarative UI that binds to ViewModel properties using the data-binding framework; the View contains no logic beyond rendering and user gesture forwarding",
      "Establish two-way data binding: ViewModel properties update the View automatically; user interactions in the View invoke ViewModel commands that update the Model",
      "Test the ViewModel in isolation: write unit tests that set ViewModel inputs and assert on output properties without instantiating any UI components"
    ],
    "steps_zh": [
      "定义模型层：领域实体、仓储和业务逻辑，不感知UI",
      "定义视图模型：表示层类，暴露可观察属性和命令；查询模型并将数据转换为视图可直接绑定的格式",
      "定义视图：声明式UI，使用数据绑定框架绑定到视图模型属性；视图除渲染和用户手势转发外不包含任何逻辑",
      "建立双向数据绑定：视图模型属性自动更新视图；视图中的用户交互调用视图模型命令来更新模型",
      "隔离测试视图模型：编写单元测试，设置视图模型输入并对输出属性进行断言，无需实例化任何UI组件"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Model",
      "View",
      "ViewModel",
      "Data Binding",
      "Command"
    ],
    "viz_labels_zh": [
      "模型",
      "视图",
      "视图模型",
      "数据绑定",
      "命令"
    ],
    "related": [
      "separation-of-concerns",
      "observer-pattern",
      "solid-principles",
      "mvc",
      "mvp",
      "flux-unidirectional"
    ],
    "tags": [
      "ui-pattern",
      "data-binding",
      "component-architecture",
      "reactive"
    ],
    "origin_author": "John Gossman, 2005, Microsoft (WPF team)",
    "origin_source": "Gossman, J. (2005). Introduction to Model/View/ViewModel pattern for building WPF apps. Microsoft MSDN blogs, October 8.",
    "origin_source_zh": "Gossman, J.（2005）。《构建WPF应用的模型/视图/视图模型模式介绍》。微软MSDN博客，10月8日。",
    "complexity": "intermediate",
    "when_to_use": [
      "When building rich-client applications (desktop or mobile) with complex UI state that maps naturally to observable data streams",
      "When the testing strategy requires verifying UI behavior without running the full UI stack — ViewModels are plain classes testable in milliseconds",
      "When using a framework with first-class data binding support such as WPF, SwiftUI, Angular, or Vue.js",
      "When the same ViewModel needs to drive multiple Views — e.g., a phone layout and a tablet layout backed by the same presentation logic"
    ],
    "when_to_use_zh": [
      "构建具有复杂UI状态的富客户端应用（桌面或移动），其状态自然映射到可观察数据流时",
      "测试策略要求在不运行完整UI堆栈的情况下验证UI行为时——视图模型是可在毫秒内测试的普通类",
      "使用具有一流数据绑定支持的框架（如WPF、SwiftUI、Angular或Vue.js）时",
      "同一视图模型需要驱动多个视图时——例如，手机布局和平板布局由相同的表示逻辑支撑"
    ],
    "core_concepts": [
      "ViewModel: the pivot of the pattern — a plain, UI-framework-agnostic class that exposes the view's state as observable properties",
      "Data binding: the mechanism by which View elements automatically reflect ViewModel property changes and ViewModel commands receive View events",
      "Command pattern: user actions (button clicks, form submissions) are exposed as command objects on the ViewModel, keeping the View passive",
      "Observable properties: property-change notifications (INotifyPropertyChanged, @Observable, reactive streams) allow the View to react without polling",
      "Separation from Model: the ViewModel adapts Model data for display — formatting, aggregation, filtering — keeping raw domain objects out of the View"
    ],
    "core_concepts_zh": [
      "视图模型：该模式的核心——一个普通的、与UI框架无关的类，将视图状态作为可观察属性暴露",
      "数据绑定：视图元素自动反映视图模型属性变化、视图模型命令接收视图事件的机制",
      "命令模式：用户操作（按钮点击、表单提交）作为命令对象暴露在视图模型上，使视图保持被动",
      "可观察属性：属性变更通知（INotifyPropertyChanged、@Observable、响应流）使视图无需轮询即可响应",
      "与模型分离：视图模型将模型数据适配用于显示——格式化、聚合、过滤——将原始领域对象排除在视图之外"
    ],
    "timeline": [
      [
        "2005",
        "John Gossman of Microsoft's WPF team publishes the MVVM pattern to exploit WPF's data binding engine"
      ],
      [
        "2010",
        "Knockout.js brings MVVM to the browser with observable data binding for JavaScript UIs"
      ],
      [
        "2014",
        "Angular (AngularJS 1.x) and Vue.js popularize MVVM-style two-way binding in web SPAs"
      ],
      [
        "2019",
        "SwiftUI launches on Apple platforms, making MVVM the idiomatic architecture for SwiftUI development"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "微软WPF团队的John Gossman发布MVVM模式，充分利用WPF的数据绑定引擎"
      ],
      [
        "2010",
        "Knockout.js将MVVM带入浏览器，为JavaScript UI提供可观察数据绑定"
      ],
      [
        "2014",
        "Angular（AngularJS 1.x）和Vue.js在Web单页应用中普及MVVM风格的双向绑定"
      ],
      [
        "2019",
        "SwiftUI在苹果平台发布，使MVVM成为Swift UI开发的惯用架构"
      ]
    ],
    "dos": [
      "Keep ViewModels free of any import from UI frameworks — a ViewModel that imports UIKit or WPF types is immediately harder to unit-test",
      "Expose commands rather than event handlers from the ViewModel so that the View remains a passive, bindable shell",
      "Use one ViewModel per logical screen or component — god ViewModels that serve many unrelated Views accumulate unmanageable state",
      "Validate input inside the ViewModel, not in the View, so validation logic is exercisable in unit tests"
    ],
    "dos_zh": [
      "保持视图模型不导入任何UI框架——导入UIKit或WPF类型的视图模型立即变得难以单元测试",
      "从视图模型暴露命令而非事件处理器，使视图保持为被动的可绑定外壳",
      "每个逻辑屏幕或组件使用一个视图模型——为许多不相关视图服务的上帝视图模型会积累难以管理的状态",
      "在视图模型内部验证输入而非在视图中验证，使验证逻辑可在单元测试中执行"
    ],
    "donts": [
      "Don't reference the View from the ViewModel — this breaks testability and creates circular dependencies",
      "Don't put navigation logic directly in the View — navigation decisions belong in the ViewModel or a dedicated Router so they can be tested",
      "Don't use MVVM in simple screens where a straightforward MVC or direct state management is sufficient — the binding overhead adds complexity without payoff",
      "Don't let data binding become bidirectional for write-heavy forms without explicit validation gating — unconstrained two-way binding leads to cascading update loops"
    ],
    "donts_zh": [
      "不要从视图模型引用视图——这会破坏可测试性并创建循环依赖",
      "不要将导航逻辑直接放在视图中——导航决策属于视图模型或专用路由器，以便可以测试",
      "不要在简单屏幕中使用MVVM，直接的MVC或状态管理就足够时——绑定开销会增加复杂性而无收益",
      "不要让数据绑定在写密集型表单中不经过显式验证就成为双向——无约束的双向绑定会导致级联更新循环"
    ],
    "case_study_company": "Microsoft",
    "case_study": "Microsoft introduced MVVM to solve a fundamental problem with WPF development: UI designers working in Blend and developers writing C# code were constantly stepping on each other's work. By introducing ViewModels as the design-test boundary, the WPF team enabled a clean Blend-to-Visual-Studio handoff where designers manipulated XAML Views while developers wrote testable ViewModel classes. This workflow was later adopted by the Windows Phone platform and became the dominant mobile architecture pattern across the Microsoft ecosystem, reducing UI-related bug rates by enabling ViewModel-level unit testing without requiring device simulators.",
    "case_study_zh": "微软引入MVVM是为了解决WPF开发中的一个根本问题：在Blend中工作的UI设计师和编写C#代码的开发者经常相互干扰对方的工作。通过引入视图模型作为设计-测试边界，WPF团队实现了清晰的Blend到Visual Studio交接——设计师操作XAML视图，开发者编写可测试的视图模型类。这种工作流程后来被Windows Phone平台采用，成为整个微软生态系统中主流的移动架构模式，通过启用无需设备模拟器的视图模型级单元测试，降低了与UI相关的缺陷率。",
    "when_not_to_use": [
      "For simple read-only displays with no user interaction — the ViewModel and binding overhead is disproportionate",
      "In frameworks without data binding support, where implementing the binding infrastructure manually negates the pattern's advantages",
      "For server-rendered web applications where state lives on the server — MVVM is fundamentally a client-side pattern"
    ],
    "when_not_to_use_zh": [
      "对于没有用户交互的简单只读显示——视图模型和绑定开销与收益不成比例",
      "在没有数据绑定支持的框架中，手动实现绑定基础设施会消除该模式的优势",
      "对于状态存储在服务器端的服务器渲染Web应用——MVVM本质上是客户端模式"
    ],
    "adopters": [
      "Microsoft (WPF, UWP, Xamarin)",
      "SwiftUI (Apple)",
      "Vue.js",
      "Angular",
      "Knockout.js"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "Gossman, J. (2005). Introduction to Model/View/ViewModel pattern for building WPF apps. Microsoft MSDN blogs.",
    "secondary_sources": [
      "Smith, J. (2009). WPF Apps With The Model-View-ViewModel Design Pattern. MSDN Magazine, February.",
      "Fowler, M. (2006). Presentation Model. martinfowler.com.",
      "Google (2022). Guide to app architecture — MVVM. Android Developers Documentation."
    ],
    "typed_relations": [
      {
        "slug": "separation-of-concerns",
        "type": "extends"
      },
      {
        "slug": "observer-pattern",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "mvc",
        "type": "alternative"
      },
      {
        "slug": "mvp",
        "type": "alternative"
      },
      {
        "slug": "flux-unidirectional",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 318,
    "name": "MVP (Model-View-Presenter)",
    "name_zh": "MVP（模型-视图-呈现者）",
    "slug": "mvp",
    "category": "architecture",
    "desc": "Evolves MVC by replacing the Controller with a Presenter that holds all UI logic, while the View becomes a passive interface that delegates every user gesture to the Presenter.",
    "desc_zh": "在MVC基础上演进，将控制器替换为持有全部UI逻辑的呈现者，同时视图成为被动接口，将所有用户手势委托给呈现者处理。",
    "steps": [
      "Define the Model: domain data, persistence, and business rules with no UI dependencies",
      "Define a thin View interface: declare a minimal contract (interface/protocol) specifying only what the View can display and what events it can fire — no logic lives here",
      "Implement the Presenter: a plain class that implements the full UI decision logic; it holds a reference to the View interface and the Model, and orchestrates all interactions",
      "Implement the concrete View (Activity, Form, Widget): wire all user gestures to Presenter method calls and implement the View interface methods to update the display",
      "Write unit tests against the Presenter using a mock View — verify that given a specific user action, the Presenter calls the correct View methods with the correct data"
    ],
    "steps_zh": [
      "定义模型：领域数据、持久化和业务规则，不依赖UI",
      "定义精简的视图接口：声明最小化契约（接口/协议），仅指定视图可以显示什么以及可以触发什么事件——这里不存放任何逻辑",
      "实现呈现者：持有完整UI决策逻辑的普通类；持有视图接口和模型的引用，协调所有交互",
      "实现具体视图（Activity、Form、Widget）：将所有用户手势连接到呈现者方法调用，并实现视图接口方法以更新显示",
      "使用模拟视图对呈现者编写单元测试——验证给定特定用户操作时，呈现者用正确数据调用正确的视图方法"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Model",
      "View",
      "Presenter",
      "User Event",
      "Update"
    ],
    "viz_labels_zh": [
      "模型",
      "视图",
      "表示器",
      "用户事件",
      "更新"
    ],
    "related": [
      "separation-of-concerns",
      "solid-principles",
      "observer-pattern",
      "mvc",
      "mvvm",
      "flux-unidirectional"
    ],
    "tags": [
      "ui-pattern",
      "testability",
      "component-architecture",
      "presentation-pattern"
    ],
    "origin_author": "Mike Potel / Taligent, 1996",
    "origin_source": "Potel, M. (1996). MVP: Model-View-Presenter — The Taligent Programming Model for C++ and Java. Taligent Inc. white paper.",
    "origin_source_zh": "Potel, M.（1996）。《MVP：模型-视图-呈现者——C++和Java的Taligent编程模型》。Taligent Inc.白皮书。",
    "complexity": "intermediate",
    "when_to_use": [
      "When UI logic must be fully unit-tested without launching the actual UI — the Presenter's View interface can be mocked trivially",
      "When working with UI frameworks (Android Activities, Windows Forms) where the View class is difficult to instantiate in tests",
      "When multiple Views need to share identical UI logic — the same Presenter can back different platform-specific View implementations",
      "When the team wants to enforce a strict passive-View constraint to prevent ad-hoc logic accumulating in UI components"
    ],
    "when_to_use_zh": [
      "UI逻辑必须在不启动实际UI的情况下完整单元测试时——呈现者的视图接口可以轻松模拟",
      "使用UI框架（Android Activity、Windows Forms）时，其中视图类难以在测试中实例化",
      "多个视图需要共享相同UI逻辑时——同一呈现者可以支撑不同平台特定的视图实现",
      "团队希望强制执行严格的被动视图约束，防止临时逻辑在UI组件中积累时"
    ],
    "core_concepts": [
      "Passive View: the View implements a minimal interface and contains zero logic — it delegates every event to the Presenter and renders exactly what it is told",
      "Presenter: the mediator between View and Model; it owns all UI decision logic and communicates with the View only through the View interface",
      "View interface: a contract (Java interface, Swift protocol, .NET interface) that decouples the Presenter from the concrete UI framework class",
      "Testability by mock: because the Presenter depends only on the View interface, tests replace the real View with a mock object to verify behavior",
      "One-to-one relationship: each View typically has exactly one Presenter; the Presenter is lifecycle-aware and manages the View's full state"
    ],
    "core_concepts_zh": [
      "被动视图：视图实现最小化接口且包含零逻辑——它将每个事件委托给呈现者，并精确渲染被告知的内容",
      "呈现者：视图和模型之间的中介；拥有所有UI决策逻辑，仅通过视图接口与视图通信",
      "视图接口：将呈现者与具体UI框架类解耦的契约（Java接口、Swift协议、.NET接口）",
      "通过模拟实现可测试性：因为呈现者仅依赖视图接口，测试用模拟对象替换真实视图来验证行为",
      "一对一关系：每个视图通常只有一个呈现者；呈现者感知生命周期并管理视图的完整状态"
    ],
    "timeline": [
      [
        "1996",
        "Mike Potel at Taligent publishes the MVP white paper, positioning it as an evolution of MVC for C++ and Java applications"
      ],
      [
        "2006",
        "Martin Fowler refines and publishes the Passive View and Supervising Controller variants of MVP on martinfowler.com"
      ],
      [
        "2008",
        "Android adopts Activity-based architecture that closely resembles MVP; community gravitates toward explicit MVP frameworks"
      ],
      [
        "2015",
        "MVP becomes the dominant Android architecture pattern with frameworks like Mosby; eventually superseded by MVVM with Architecture Components in 2017"
      ]
    ],
    "timeline_zh": [
      [
        "1996",
        "Taligent的Mike Potel发布MVP白皮书，将其定位为面向C++和Java应用的MVC演进"
      ],
      [
        "2006",
        "Martin Fowler在martinfowler.com上细化并发布MVP的被动视图和监督控制器变体"
      ],
      [
        "2008",
        "Android采用类似MVP的基于Activity的架构；社区转向明确的MVP框架"
      ],
      [
        "2015",
        "MVP成为主流Android架构模式（Mosby等框架）；2017年最终被架构组件中的MVVM取代"
      ]
    ],
    "dos": [
      "Define the View as an interface with the minimum surface area needed — every extra method on the View interface is a method you must mock in tests",
      "Keep Presenters stateless where possible, or clearly document the state they manage, so that tests can set up known preconditions",
      "One Presenter per View screen — splitting a large screen into sub-Views each with their own Presenter improves cohesion and makes tests focused",
      "Inject dependencies (Model, services) into the Presenter constructor so tests can provide substitutes without service locators"
    ],
    "dos_zh": [
      "将视图定义为具有最小所需接口面积的接口——视图接口上的每个额外方法都是必须在测试中模拟的方法",
      "尽可能保持呈现者无状态，或清楚记录其管理的状态，以便测试可以设置已知前置条件",
      "每个视图屏幕一个呈现者——将大型屏幕拆分为各自具有呈现者的子视图可以提高内聚性并使测试更聚焦",
      "将依赖项（模型、服务）注入呈现者构造函数，以便测试可以提供替代品而无需服务定位器"
    ],
    "donts": [
      "Don't let the Presenter hold a direct reference to a concrete View class — this prevents mocking and couples the Presenter to the UI framework",
      "Don't call View methods from background threads without marshalling back to the UI thread — threading bugs create intermittent, hard-to-reproduce failures",
      "Don't put navigation logic in the View — navigation belongs in the Presenter or a Router so the routing decision is testable",
      "Don't skip the View interface because it seems like overhead — skipping it is the single most common mistake that makes MVP code untestable"
    ],
    "donts_zh": [
      "不要让呈现者持有对具体视图类的直接引用——这会阻止模拟并将呈现者与UI框架耦合",
      "不要从后台线程调用视图方法而不回调到UI线程——线程错误会产生间歇性、难以重现的故障",
      "不要将导航逻辑放在视图中——导航属于呈现者或路由器，使路由决策可测试",
      "不要因为视图接口看似额外开销就跳过它——跳过它是使MVP代码无法测试的最常见错误"
    ],
    "case_study_company": "Google (Android)",
    "case_study": "Google's Android team published the Android MVP sample in 2015 as part of the android-architecture repository to demonstrate how to write testable Android applications. Prior to MVP adoption, Android code was concentrated in Activities that mixed UI, business logic, and data access — making unit testing nearly impossible without Robolectric's slow JVM-based Android simulation. By extracting all UI decisions into plain Java Presenters backed by View interfaces, teams achieved Presenter test suites that ran in under 100ms on the JVM, compared to 30+ seconds with instrumented tests on emulators.",
    "case_study_zh": "谷歌Android团队在2015年作为android-architecture仓库的一部分发布了Android MVP示例，展示如何编写可测试的Android应用程序。在采用MVP之前，Android代码集中在混合了UI、业务逻辑和数据访问的Activity中——使得单元测试在没有Robolectric慢速JVM-based Android模拟的情况下几乎不可能。通过将所有UI决策提取到由视图接口支撑的普通Java呈现者中，团队在JVM上实现了100ms以内运行的呈现者测试套件，而在模拟器上的仪表化测试需要30秒以上。",
    "when_not_to_use": [
      "For simple CRUD screens where the UI logic is trivial — the View interface boilerplate adds more code than it saves in testing effort",
      "When using reactive frameworks (RxJava, Kotlin Flow, Combine) that naturally enable MVVM with less ceremony than MVP",
      "For highly dynamic UIs where the View must frequently update its own structure — the passive View constraint becomes a bottleneck"
    ],
    "when_not_to_use_zh": [
      "对于UI逻辑微不足道的简单CRUD屏幕——视图接口样板代码增加的代码量超过了在测试工作中节省的量",
      "使用响应式框架（RxJava、Kotlin Flow、Combine）时，这些框架自然地以比MVP更少的仪式实现MVVM",
      "对于视图必须频繁更新自身结构的高度动态UI——被动视图约束会成为瓶颈"
    ],
    "adopters": [
      "Android (early community standard)",
      "Google Web Toolkit (GWT)",
      "Windows Forms (.NET)",
      "Vaadin (Java web)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "testability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Potel, M. (1996). MVP: Model-View-Presenter. Taligent Inc. white paper.",
    "secondary_sources": [
      "Fowler, M. (2006). Passive View. martinfowler.com.",
      "Fowler, M. (2006). Supervising Controller. martinfowler.com.",
      "Google (2015). Android Architecture Blueprints — MVP sample. github.com/android/architecture-samples."
    ],
    "typed_relations": [
      {
        "slug": "separation-of-concerns",
        "type": "extends"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "observer-pattern",
        "type": "related"
      },
      {
        "slug": "mvc",
        "type": "alternative"
      },
      {
        "slug": "mvvm",
        "type": "alternative"
      },
      {
        "slug": "flux-unidirectional",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 319,
    "name": "Clean Architecture",
    "name_zh": "整洁架构",
    "slug": "clean-architecture",
    "category": "architecture",
    "desc": "Organizes code into concentric dependency rings — Entities, Use Cases, Interface Adapters, Frameworks — where the Dependency Rule mandates all source-code dependencies point inward, making the system independent of UI, database, and frameworks.",
    "desc_zh": "将代码组织为同心依赖环——实体、用例、接口适配器、框架——依赖规则规定所有源代码依赖指向内部，使系统独立于UI、数据库和框架。",
    "steps": [
      "Identify the core Entities: business objects and rules that exist independent of any application — these form the innermost ring and have zero external dependencies",
      "Define Use Cases (Interactors): application-specific business rules that orchestrate Entities; each Use Case captures one piece of user-facing functionality and depends only on Entities",
      "Define Interface Adapters: Controllers, Presenters, and Gateways that convert data between the format convenient for Use Cases and the format required by external systems (DB, web, UI)",
      "Place all external concerns (frameworks, drivers, UI) in the outermost ring: these are implementation details that can be swapped without affecting any inner ring",
      "Enforce the Dependency Rule mechanically: use dependency inversion (interfaces defined in inner rings, implemented in outer rings) to ensure no inner ring ever imports from an outer ring"
    ],
    "steps_zh": [
      "识别核心实体：独立于任何应用程序存在的业务对象和规则——这些形成最内层环，零外部依赖",
      "定义用例（交互器）：协调实体的应用程序特定业务规则；每个用例捕获一项面向用户的功能，仅依赖实体",
      "定义接口适配器：控制器、呈现者和网关，在用例方便的格式和外部系统（数据库、Web、UI）所需格式之间转换数据",
      "将所有外部关注点（框架、驱动程序、UI）放在最外层环：这些是可以在不影响任何内层环的情况下替换的实现细节",
      "机械地强制执行依赖规则：使用依赖倒置（在内层环中定义接口，在外层环中实现）确保内层环永远不从外层环导入"
    ],
    "ai_relevant": false,
    "viz_type": "concentric",
    "viz_labels": [
      "Entities",
      "Use Cases",
      "Adapters",
      "Frameworks",
      "Dependency Rule"
    ],
    "viz_labels_zh": [
      "实体",
      "用例",
      "适配器",
      "框架层",
      "依赖规则"
    ],
    "related": [
      "hexagonal-architecture",
      "solid-principles",
      "domain-driven-design",
      "onion-architecture",
      "n-tier-layered",
      "ports-and-adapters"
    ],
    "tags": [
      "dependency-inversion",
      "testability",
      "framework-independence",
      "enterprise-architecture"
    ],
    "origin_author": "Robert C. Martin (Uncle Bob), 2012",
    "origin_source": "Martin, R.C. (2012). The Clean Architecture. The Clean Coder Blog, August 13. blog.cleancoder.com.",
    "origin_source_zh": "Martin, R.C.（2012）。《整洁架构》。The Clean Coder博客，8月13日。blog.cleancoder.com。",
    "complexity": "advanced",
    "when_to_use": [
      "When the application must remain testable without any external infrastructure — databases, web servers, or UI frameworks should be optional at test time",
      "When the team anticipates replacing or upgrading the database, web framework, or UI technology during the system's lifetime",
      "When domain rules are complex enough to justify the overhead — Clean Architecture pays dividends when business logic has many rules and variations",
      "When building a long-lived enterprise application where multiple teams need clear, enforced boundaries between concerns"
    ],
    "when_to_use_zh": [
      "应用程序必须在没有任何外部基础设施的情况下可测试时——数据库、Web服务器或UI框架在测试时应该是可选的",
      "团队预期在系统生命周期内替换或升级数据库、Web框架或UI技术时",
      "领域规则足够复杂以证明开销合理时——当业务逻辑有许多规则和变体时，整洁架构带来回报",
      "构建多团队需要清晰、强制执行关注点边界的长寿命企业应用程序时"
    ],
    "core_concepts": [
      "The Dependency Rule: source code dependencies must always point inward — outer rings know about inner rings, never the reverse",
      "Entities: enterprise-wide business rules encapsulated as objects; the most stable, rarely-changing core of the system",
      "Use Cases: application-specific business rules that define the system's behavior; they direct data flow to and from Entities",
      "Interface Adapters: the translation layer — Controllers convert web requests to Use Case inputs; Presenters convert Use Case outputs to View models; Gateways implement Repository interfaces defined by Use Cases",
      "Dependency Inversion at the boundary: inner rings define interfaces (Repository, Presenter interface); outer rings provide implementations — control flows inward but dependency points outward, toward the inner ring's interface"
    ],
    "core_concepts_zh": [
      "依赖规则：源代码依赖必须始终指向内部——外层环了解内层环，反之则不然",
      "实体：封装为对象的企业范围业务规则；系统中最稳定、最少变化的核心",
      "用例：定义系统行为的应用程序特定业务规则；它们指导数据流向和来自实体",
      "接口适配器：翻译层——控制器将Web请求转换为用例输入；呈现者将用例输出转换为视图模型；网关实现由用例定义的仓储接口",
      "边界处的依赖倒置：内层环定义接口（仓储、呈现者接口）；外层环提供实现——控制流向内但依赖指向外部，朝向内层环的接口"
    ],
    "timeline": [
      [
        "2012",
        "Robert C. Martin publishes the Clean Architecture blog post, synthesizing prior work on Hexagonal, Onion, and BCE architectures"
      ],
      [
        "2017",
        "Martin publishes 「Clean Architecture: A Craftsman's Guide to Software Structure and Design」, formalizing the ring diagram"
      ],
      [
        "2018",
        "The Android community adopts Clean Architecture as the de-facto standard, influencing Google's official Architecture Components guidance"
      ],
      [
        "2020",
        "Clean Architecture becomes a standard teaching pattern in Go, Kotlin, and .NET communities with framework-specific adaptations"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Robert C. Martin发布整洁架构博客文章，综合了六边形、洋葱和BCE架构的先前工作"
      ],
      [
        "2017",
        "Martin出版《整洁架构：软件结构与设计匠艺指南》，正式确立环形图"
      ],
      [
        "2018",
        "Android社区将整洁架构采纳为事实标准，影响谷歌官方架构组件指南"
      ],
      [
        "2020",
        "整洁架构成为Go、Kotlin和.NET社区的标准教学模式，具有框架特定的适配"
      ]
    ],
    "dos": [
      "Define all repository and external service interfaces inside the Use Case ring — the Use Case owns the contract, the outer ring provides the implementation",
      "Pass simple data structures (DTOs, primitives) across ring boundaries rather than rich domain objects — this prevents leaking inner-ring types into outer rings",
      "Write Use Case unit tests that supply all dependencies as in-memory fakes — a Use Case test that touches a real database or network is not a unit test",
      "Use package/module naming that reflects the ring — e.g., domain, application, infrastructure — so that import direction violations are immediately visible"
    ],
    "dos_zh": [
      "在用例环内定义所有仓储和外部服务接口——用例拥有契约，外层环提供实现",
      "跨环边界传递简单数据结构（DTO、基本类型）而非丰富领域对象——这防止内层环类型泄漏到外层环",
      "编写将所有依赖项作为内存假实现提供的用例单元测试——接触真实数据库或网络的用例测试不是单元测试",
      "使用反映环层的包/模块命名——例如domain、application、infrastructure——使导入方向违规立即可见"
    ],
    "donts": [
      "Don't let framework annotations (Spring @Entity, JPA annotations) leak into the Entity ring — this couples your most stable code to your most volatile dependency",
      "Don't create a single monolithic Use Case class that handles an entire feature area — each Use Case should do one thing and express one business intention",
      "Don't cross ring boundaries in tests to save setup time — testing an outer ring object by letting it call into the real inner ring defeats the isolation purpose",
      "Don't apply Clean Architecture to small scripts or microservices with trivial logic — the indirection overhead outweighs the benefits for simple CRUD operations"
    ],
    "donts_zh": [
      "不要让框架注解（Spring @Entity、JPA注解）泄漏到实体环——这将最稳定的代码与最易变的依赖耦合",
      "不要创建处理整个功能区域的单一整体用例类——每个用例应该做一件事并表达一个业务意图",
      "不要在测试中跨越环边界以节省设置时间——通过让外层环对象调用真实内层环来测试它会破坏隔离目的",
      "不要将整洁架构应用于具有微不足道逻辑的小型脚本或微服务——对于简单的CRUD操作，间接开销超过收益"
    ],
    "case_study_company": "Nubank",
    "case_study": "Nubank, the Brazilian fintech serving over 90 million customers, adopted Clean Architecture across its Clojure and Kotlin microservices to enforce strict separation between rapidly-changing business rules and stable infrastructure choices. By placing all financial calculation and compliance rules in the innermost Entity and Use Case rings, Nubank's engineering team was able to replace their persistence layer (migrating from Datomic to PostgreSQL for specific services) without touching business logic code. Their Use Case test suites run in under 5 seconds with in-memory repositories, enabling a TDD workflow that catches business rule regressions before any infrastructure is involved.",
    "case_study_zh": "服务超过9000万客户的巴西金融科技公司Nubank在其Clojure和Kotlin微服务中采用整洁架构，强制在快速变化的业务规则和稳定的基础设施选择之间严格分离。通过将所有财务计算和合规规则放在最内层的实体和用例环中，Nubank的工程团队能够在不接触业务逻辑代码的情况下替换其持久化层（将特定服务从Datomic迁移到PostgreSQL）。他们的用例测试套件使用内存仓储在5秒内运行，实现了在涉及任何基础设施之前捕获业务规则退化的TDD工作流。",
    "case_study_challenge": "Nubank, serving over 90 million customers in Brazil, needed to evolve its financial calculation and compliance logic at startup speed while maintaining the regulatory rigor of a bank. Infrastructure choices made years earlier were becoming constraints, but changing them risked breaking critical business rules embedded throughout the codebase.",
    "case_study_challenge_zh": "服务巴西超过9000万客户的Nubank，需要以创业公司的速度迭代其金融计算和合规逻辑，同时保持银行级别的监管严谨性。数年前做出的基础设施选择正在成为约束，但更换它们有可能破坏散布在代码库中的关键业务规则。",
    "case_study_approach": "Nubank adopted Clean Architecture across its Clojure and Kotlin microservices, placing all financial calculation and compliance rules in the innermost Entity and Use Case rings. Infrastructure — databases, message queues, external APIs — lived in outer rings behind port interfaces. Business logic had zero import dependencies on any framework or database driver.",
    "case_study_approach_zh": "Nubank在其Clojure和Kotlin微服务中采用整洁架构，将所有财务计算和合规规则放在最内层的实体和用例环中。基础设施——数据库、消息队列、外部API——位于端口接口之后的外层环中。业务逻辑对任何框架或数据库驱动都没有导入依赖。",
    "case_study_result": "The team replaced their persistence layer — migrating specific services from Datomic to PostgreSQL — without touching a single line of business logic code. Use Case test suites run in under 5 seconds with in-memory repositories, enabling a TDD workflow that catches business rule regressions before any infrastructure is involved.",
    "case_study_result_zh": "团队替换了持久化层——将特定服务从Datomic迁移到PostgreSQL——而未触及一行业务逻辑代码。用例测试套件使用内存仓储在5秒内运行完毕，实现了在涉及任何基础设施之前就能捕获业务规则退化的TDD工作流。",
    "case_study_quote": "We can swap a database in a week. We cannot swap a business rule that has been tested against a million edge cases. Clean Architecture lets us protect what matters.",
    "case_study_quote_zh": "我们可以在一周内更换数据库，但无法替换一条经过百万边界情况验证的业务规则。整洁架构让我们保护了真正重要的东西。",
    "when_not_to_use": [
      "For simple CRUD microservices where the business logic is thin and the value of the pattern's indirection layers is negative",
      "For quick prototypes or MVPs where time-to-market matters more than long-term maintainability",
      "For teams unfamiliar with dependency inversion — misapplied Clean Architecture creates complexity without the promised testability benefits"
    ],
    "when_not_to_use_zh": [
      "对于业务逻辑单薄的简单CRUD微服务，该模式间接层的价值为负",
      "对于上市时间比长期可维护性更重要的快速原型或MVP",
      "对于不熟悉依赖倒置的团队——错误应用的整洁架构会创建复杂性而没有承诺的可测试性收益"
    ],
    "adopters": [
      "Nubank",
      "Android community (standard pattern)",
      ".NET community (Clean Architecture template)",
      "Go microservices community"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "testability",
      "portability"
    ],
    "maturity_ring": "established",
    "primary_source": "Martin, R.C. (2012). The Clean Architecture. blog.cleancoder.com, August 13.",
    "secondary_sources": [
      "Martin, R.C. (2017). Clean Architecture: A Craftsman's Guide to Software Structure and Design. Prentice Hall.",
      "Palermo, J. (2008). The Onion Architecture. jeffreypalermo.com.",
      "Cockburn, A. (2005). Hexagonal Architecture (Ports and Adapters). alistair.cockburn.us."
    ],
    "typed_relations": [
      {
        "slug": "hexagonal-architecture",
        "type": "related"
      },
      {
        "slug": "solid-principles",
        "type": "prerequisite"
      },
      {
        "slug": "domain-driven-design",
        "type": "complement"
      },
      {
        "slug": "onion-architecture",
        "type": "alternative"
      },
      {
        "slug": "n-tier-layered",
        "type": "alternative"
      },
      {
        "slug": "ports-and-adapters",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 320,
    "name": "Onion Architecture",
    "name_zh": "洋葱架构",
    "slug": "onion-architecture",
    "category": "architecture",
    "desc": "Structures applications as concentric rings around a Domain Model core, where all dependencies flow inward and infrastructure lives in the outermost ring, making the domain model completely independent of persistence and UI concerns.",
    "desc_zh": "将应用程序构建为围绕领域模型核心的同心环，所有依赖指向内部，基础设施位于最外层环，使领域模型完全独立于持久化和UI关注点。",
    "steps": [
      "Place the Domain Model at the core: pure domain objects (entities, value objects, aggregates) with no dependencies on any outer ring or external library",
      "Wrap with Domain Services: interfaces and services that operate on the Domain Model but remain framework-independent; repository interfaces are defined here",
      "Add Application Services: orchestration logic that coordinates domain objects to fulfill use cases; this ring depends on Domain Model and Domain Services but not on infrastructure",
      "Implement the Infrastructure ring: concrete implementations of repository interfaces, ORM mappings, external API clients, and UI frameworks — all pointing inward",
      "Wire via dependency injection: the application entry point (e.g., startup class) instantiates infrastructure implementations and injects them into application services through the interfaces defined in inner rings"
    ],
    "steps_zh": [
      "将领域模型置于核心：纯领域对象（实体、值对象、聚合），不依赖任何外层环或外部库",
      "用领域服务包裹：在领域模型上操作但保持框架无关的接口和服务；仓储接口在此处定义",
      "添加应用服务：协调领域对象以完成用例的编排逻辑；此环依赖领域模型和领域服务，但不依赖基础设施",
      "实现基础设施环：仓储接口的具体实现、ORM映射、外部API客户端和UI框架——全部指向内部",
      "通过依赖注入连接：应用程序入口点（例如启动类）实例化基础设施实现，并通过内层环定义的接口将其注入应用服务"
    ],
    "ai_relevant": false,
    "viz_type": "concentric",
    "viz_labels": [
      "Domain Model",
      "Domain Services",
      "Application",
      "Infrastructure"
    ],
    "viz_labels_zh": [
      "领域模型",
      "领域服务",
      "应用服务",
      "基础设施"
    ],
    "related": [
      "hexagonal-architecture",
      "domain-driven-design",
      "solid-principles",
      "clean-architecture",
      "n-tier-layered",
      "ports-and-adapters"
    ],
    "tags": [
      "domain-model",
      "dependency-inversion",
      "testability",
      "enterprise-architecture"
    ],
    "origin_author": "Jeffrey Palermo, 2008",
    "origin_source": "Palermo, J. (2008). The Onion Architecture: part 1–4. jeffreypalermo.com, July–August.",
    "origin_source_zh": "Palermo, J.（2008）。《洋葱架构：第1-4部分》。jeffreypalermo.com，7月至8月。",
    "complexity": "intermediate",
    "when_to_use": [
      "When domain logic is complex and must remain insulated from framework churn — changing the ORM or web framework should not touch domain code",
      "When practicing Domain-Driven Design and needing an architectural pattern that enforces the domain model's primacy",
      "When the team needs to test domain and application logic in isolation from databases and external services",
      "When building .NET enterprise applications where the pattern has strong tooling and community support"
    ],
    "when_to_use_zh": [
      "领域逻辑复杂且必须与框架更迭隔离时——更换ORM或Web框架不应接触领域代码",
      "实践领域驱动设计且需要强制执行领域模型首要地位的架构模式时",
      "团队需要在与数据库和外部服务隔离的情况下测试领域和应用逻辑时",
      "构建该模式具有强大工具和社区支持的.NET企业应用程序时"
    ],
    "core_concepts": [
      "Domain Model at the core: the innermost ring contains pure domain objects — no framework dependencies, no infrastructure imports, maximum stability",
      "Inward dependency rule: every ring depends only on rings closer to the center, never on rings farther out",
      "Repository interfaces inward: repository interfaces are defined inside the Domain Services ring; concrete implementations live in the outermost Infrastructure ring",
      "Application Services ring: thin orchestration layer that calls domain services and repositories to fulfill use cases; does not contain business rules",
      "Infrastructure at the periphery: everything that touches the outside world (databases, file systems, external APIs, UI) is isolated in the outermost ring and can be replaced independently"
    ],
    "core_concepts_zh": [
      "核心处的领域模型：最内层环包含纯领域对象——无框架依赖，无基础设施导入，最高稳定性",
      "向内依赖规则：每个环仅依赖更靠近中心的环，从不依赖更远的环",
      "仓储接口指向内部：仓储接口在领域服务环内定义；具体实现位于最外层基础设施环",
      "应用服务环：调用领域服务和仓储以完成用例的精简编排层；不包含业务规则",
      "外围的基础设施：所有接触外部世界的内容（数据库、文件系统、外部API、UI）都隔离在最外层环，可以独立替换"
    ],
    "timeline": [
      [
        "2008",
        "Jeffrey Palermo publishes the four-part Onion Architecture series on his blog, coining the term and the ring diagram"
      ],
      [
        "2010",
        "The pattern gains traction in the .NET community as an alternative to traditional N-Tier that enables domain-centric design"
      ],
      [
        "2012",
        "Robert C. Martin acknowledges Onion Architecture as a predecessor when publishing Clean Architecture, noting their shared Dependency Rule"
      ],
      [
        "2015",
        "Onion Architecture becomes a standard recommendation in .NET enterprise development circles and DDD-oriented project templates"
      ]
    ],
    "timeline_zh": [
      [
        "2008",
        "Jeffrey Palermo在其博客上发布四部分洋葱架构系列，创造该术语和环形图"
      ],
      [
        "2010",
        "该模式在.NET社区获得牵引力，作为传统N层架构的替代方案，实现领域中心设计"
      ],
      [
        "2012",
        "Robert C. Martin在发布整洁架构时承认洋葱架构是其前身，指出它们共享的依赖规则"
      ],
      [
        "2015",
        "洋葱架构成为.NET企业开发圈和面向DDD项目模板的标准推荐"
      ]
    ],
    "dos": [
      "Define all repository and service interfaces inside the Domain Services ring, never in the Infrastructure ring, so that domain code never imports from infrastructure",
      "Keep the Domain Model ring free of all annotations and framework-specific base classes — a pure POCO/POJO domain model can be tested without bootstrapping the framework",
      "Use dependency injection containers configured at the application entry point to bind infrastructure implementations to inner-ring interfaces",
      "Separate Application Services (use case orchestration) from Domain Services (business rules applied to domain objects) — conflating them erodes the ring structure"
    ],
    "dos_zh": [
      "在领域服务环内定义所有仓储和服务接口，绝不在基础设施环中，以确保领域代码永不从基础设施导入",
      "保持领域模型环不含所有注解和框架特定的基类——纯POCO/POJO领域模型可以不启动框架即可测试",
      "使用在应用程序入口点配置的依赖注入容器将基础设施实现绑定到内层环接口",
      "将应用服务（用例编排）与领域服务（应用于领域对象的业务规则）分开——混淆它们会侵蚀环结构"
    ],
    "donts": [
      "Don't place Entity Framework or Hibernate entity classes in the Domain Model ring — ORM-decorated entities couple the domain to the persistence framework",
      "Don't skip the Application Services ring and call domain objects directly from the UI or API layer — this bypasses authorization, validation, and orchestration logic",
      "Don't allow infrastructure concerns (connection strings, retry policies) to bleed into application services — infrastructure configuration belongs in the outermost ring",
      "Don't treat Onion Architecture as simply renaming N-Tier layers — the critical difference is the direction of dependencies and where interfaces are defined"
    ],
    "donts_zh": [
      "不要将Entity Framework或Hibernate实体类放在领域模型环中——ORM装饰的实体将领域与持久化框架耦合",
      "不要跳过应用服务环直接从UI或API层调用领域对象——这会绕过授权、验证和编排逻辑",
      "不要让基础设施关注点（连接字符串、重试策略）渗入应用服务——基础设施配置属于最外层环",
      "不要将洋葱架构视为简单地重命名N层架构层——关键区别在于依赖方向和接口定义位置"
    ],
    "case_study_company": "ThoughtWorks",
    "case_study": "ThoughtWorks consultants adopted Onion Architecture on a large UK insurance platform migration project, replacing a fragile N-Tier monolith where business rules were scattered across stored procedures, service classes, and UI code. By restructuring around a domain model core, the team isolated all premium calculation and policy validation rules in a pure domain ring that could be unit-tested in milliseconds without database or web server dependencies. After six months, the domain ring had 94% unit test coverage with test suites running in 8 seconds, enabling confident refactoring of the 12-year-old business rules.",
    "case_study_zh": "ThoughtWorks顾问在一个大型英国保险平台迁移项目中采用洋葱架构，替换一个脆弱的N层单体系统——业务规则分散在存储过程、服务类和UI代码中。通过围绕领域模型核心重构，团队将所有保费计算和保单验证规则隔离在纯领域环中，可以在不依赖数据库或Web服务器的情况下在毫秒内进行单元测试。六个月后，领域环达到94%的单元测试覆盖率，测试套件在8秒内运行，实现了对12年历史业务规则的自信重构。",
    "when_not_to_use": [
      "For microservices with thin business logic where a simple three-layer structure (API, Service, Repository) provides sufficient separation",
      "When the team is not practicing DDD — the Onion Architecture's ring structure maps naturally to DDD concepts; without DDD fluency the rings tend to collapse",
      "For frontend applications — the pattern is server-side focused and does not translate cleanly to UI component architectures"
    ],
    "when_not_to_use_zh": [
      "对于业务逻辑单薄的微服务，简单的三层结构（API、服务、仓储）提供了足够的分离",
      "当团队不实践DDD时——洋葱架构的环结构自然映射到DDD概念；没有DDD流畅度，环往往会塌陷",
      "对于前端应用程序——该模式以服务器端为中心，不能清晰地转化为UI组件架构"
    ],
    "adopters": [
      ".NET enterprise community",
      "Java Spring community",
      "ThoughtWorks client projects"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "Palermo, J. (2008). The Onion Architecture: parts 1–4. jeffreypalermo.com.",
    "secondary_sources": [
      "Martin, R.C. (2017). Clean Architecture: A Craftsman's Guide to Software Structure and Design. Prentice Hall.",
      "Evans, E. (2003). Domain-Driven Design: Tackling Complexity in the Heart of Software. Addison-Wesley.",
      "Smith, S. (2020). Clean Architecture solution template for ASP.NET Core. github.com/ardalis/CleanArchitecture."
    ],
    "typed_relations": [
      {
        "slug": "hexagonal-architecture",
        "type": "related"
      },
      {
        "slug": "domain-driven-design",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "prerequisite"
      },
      {
        "slug": "clean-architecture",
        "type": "alternative"
      },
      {
        "slug": "n-tier-layered",
        "type": "alternative"
      },
      {
        "slug": "ports-and-adapters",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 321,
    "name": "N-Tier / Layered Architecture",
    "name_zh": "N层/分层架构",
    "slug": "n-tier-layered",
    "category": "architecture",
    "desc": "Organizes software into horizontal layers — typically Presentation, Business Logic, and Data Access — where each layer depends only on the layer directly below it, establishing clear separation of concerns across the entire application.",
    "desc_zh": "将软件组织为水平层次——通常是表示层、业务逻辑层和数据访问层——其中每层仅依赖其正下方的层，在整个应用程序中建立清晰的关注点分离。",
    "steps": [
      "Identify the layers appropriate to the application: minimally Presentation (UI/API), Business Logic (domain/service), and Data Access (repository/ORM); large enterprise systems may add Security, Integration, and Messaging layers",
      "Define the contract between adjacent layers: each layer exposes a service interface to the layer above; the implementation details of each layer are hidden from higher layers",
      "Implement each layer independently: Presentation handles HTTP requests and view rendering; Business Logic applies rules and orchestrates operations; Data Access manages persistence",
      "Enforce the constraint that each layer may only call the layer immediately below it — Presentation must not access Data Access directly, bypassing Business Logic",
      "Deploy and scale layers independently where needed: web tier can be scaled horizontally while the database tier uses vertical scaling"
    ],
    "steps_zh": [
      "确定适合应用程序的层次：最小为表示层（UI/API）、业务逻辑层（领域/服务）和数据访问层（仓储/ORM）；大型企业系统可能添加安全、集成和消息传递层",
      "定义相邻层之间的契约：每层向其上方的层暴露服务接口；每层的实现细节对高层隐藏",
      "独立实现每层：表示层处理HTTP请求和视图渲染；业务逻辑层应用规则并协调操作；数据访问层管理持久化",
      "强制执行每层只能调用其正下方层的约束——表示层不得绕过业务逻辑层直接访问数据访问层",
      "在需要时独立部署和扩展各层：Web层可以水平扩展，而数据库层使用垂直扩展"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Presentation",
      "Application",
      "Business",
      "Data"
    ],
    "viz_labels_zh": [
      "展示层",
      "应用层",
      "业务层",
      "数据层"
    ],
    "related": [
      "layered-architecture",
      "separation-of-concerns",
      "solid-principles",
      "clean-architecture",
      "hexagonal-architecture",
      "onion-architecture",
      "ports-and-adapters"
    ],
    "tags": [
      "enterprise-architecture",
      "separation-of-concerns",
      "layering",
      "scalability"
    ],
    "origin_author": "1990s enterprise computing; formalized by Microsoft and Sun Microsystems",
    "origin_source": "Fowler, M. (2002). Patterns of Enterprise Application Architecture. Addison-Wesley (Layered Architecture pattern).",
    "origin_source_zh": "Fowler, M.（2002）。《企业应用架构模式》。Addison-Wesley（分层架构模式）。",
    "complexity": "beginner",
    "when_to_use": [
      "When building a standard business application — CRUD-heavy systems, content management, or transactional backends — where simplicity and team familiarity matter",
      "When the team is new to software architecture and needs a pattern with clear, enforceable rules that are easy to communicate",
      "When using enterprise frameworks (Java EE, Spring, ASP.NET, PHP Laravel) that are built around layered conventions",
      "When organizational separation of concerns maps naturally to layers — e.g., separate teams own the UI, services, and database schemas"
    ],
    "when_to_use_zh": [
      "构建标准业务应用程序——以CRUD为主的系统、内容管理或事务后端——简洁性和团队熟悉度很重要时",
      "团队对软件架构陌生，需要具有清晰、可执行且易于传达规则的模式时",
      "使用围绕分层约定构建的企业框架（Java EE、Spring、ASP.NET、PHP Laravel）时",
      "当组织关注点分离自然映射到层次时——例如，独立团队拥有UI、服务和数据库模式"
    ],
    "core_concepts": [
      "Horizontal layers: each layer groups related responsibilities — Presentation (what users see), Business Logic (what the system does), Data Access (how data is stored)",
      "Layer contract: each layer exposes a well-defined API surface to the layer above and hides its implementation — swapping a layer should not affect layers above it",
      "Strict vs. relaxed layering: strict layering forbids skipping layers; relaxed layering allows a layer to call any layer below it for performance reasons",
      "Physical vs. logical tiers: layers are a logical separation; tiers are the physical deployment — a two-tier deployment can implement a three-layer logical design",
      "Sinkholes anti-pattern: when a layer simply passes a request through to the layer below without adding value, the architecture becomes unnecessarily complex for its benefit"
    ],
    "core_concepts_zh": [
      "水平层次：每层分组相关职责——表示层（用户所见）、业务逻辑层（系统所做）、数据访问层（数据存储方式）",
      "层次契约：每层向其上方的层暴露定义良好的API接口并隐藏其实现——替换一层不应影响其上方的层",
      "严格与宽松分层：严格分层禁止跳层；宽松分层允许一层出于性能原因调用其下方的任何层",
      "物理层与逻辑层：层次是逻辑分离；层级是物理部署——两层部署可以实现三层逻辑设计",
      "污水孔反模式：当一层只是将请求传递到下方层而不增加价值时，架构相对其收益变得不必要地复杂"
    ],
    "timeline": [
      [
        "1992",
        "The three-tier client-server model emerges as a mainstream enterprise architecture pattern, separating mainframe presentation, business logic, and data"
      ],
      [
        "2002",
        "Martin Fowler codifies Layered Architecture in Patterns of Enterprise Application Architecture, providing a canonical reference"
      ],
      [
        "2004",
        "Java EE (J2EE) and Spring Framework cement layered architecture as the dominant enterprise Java pattern"
      ],
      [
        "2010",
        "N-Tier becomes the default starting architecture for ASP.NET MVC, Django, and Rails applications across the industry"
      ]
    ],
    "timeline_zh": [
      [
        "1992",
        "三层客户端-服务器模型作为主流企业架构模式出现，分离主机表示、业务逻辑和数据"
      ],
      [
        "2002",
        "Martin Fowler在《企业应用架构模式》中编纂分层架构，提供规范参考"
      ],
      [
        "2004",
        "Java EE（J2EE）和Spring框架将分层架构确立为主流企业Java模式"
      ],
      [
        "2010",
        "N层成为行业中ASP.NET MVC、Django和Rails应用程序的默认起始架构"
      ]
    ],
    "dos": [
      "Define explicit service interfaces between layers so each layer can be tested with a mock of the layer below — this is the key to making a layered application testable",
      "Keep each layer focused on a single concern — a Business Logic layer that also builds SQL queries or HTML is no longer a proper layer",
      "Document which layer owns each responsibility explicitly at project start — ambiguity leads to logic leaking across layers as the team grows",
      "Use dependency injection to supply lower-layer implementations to higher layers — avoid static method calls across layer boundaries which create untestable coupling"
    ],
    "dos_zh": [
      "在层之间定义明确的服务接口，使每层可以用其下方层的模拟进行测试——这是使分层应用程序可测试的关键",
      "保持每层专注于单一关注点——同时构建SQL查询或HTML的业务逻辑层不再是适当的层",
      "在项目开始时明确记录每层拥有哪些职责——歧义会导致随着团队成长逻辑跨层泄漏",
      "使用依赖注入将低层实现供应给高层——避免跨层边界的静态方法调用，这会创建无法测试的耦合"
    ],
    "donts": [
      "Don't allow the Presentation layer to query the database directly — this shortcut erodes the architecture and creates security and maintainability risks",
      "Don't create a single shared domain model that all layers reference — this shared kernel becomes a change magnet that couples all layers to each other",
      "Don't mistake physical deployment tiers for logical layers — a monolith deployed as a single process can still have three clean logical layers",
      "Don't keep adding layers for every new concern — excessive layering (six or more layers) creates sinkhole anti-patterns where most calls pass through unchanged"
    ],
    "donts_zh": [
      "不要允许表示层直接查询数据库——这种捷径侵蚀架构并造成安全和可维护性风险",
      "不要创建所有层都引用的单一共享领域模型——这个共享内核成为变更磁铁，将所有层相互耦合",
      "不要将物理部署层级误认为逻辑层次——作为单个进程部署的单体系统仍然可以有三个清晰的逻辑层",
      "不要为每个新关注点不断添加层次——过度分层（六层或更多层）会产生大多数调用不加改变地传递的污水孔反模式"
    ],
    "case_study_company": "Stack Overflow",
    "case_study": "Stack Overflow famously runs on a classic three-tier layered architecture — ASP.NET MVC presentation layer, C# service layer with business logic, and a SQL Server data layer via Dapper — rather than adopting microservices. This deliberate simplicity, with strict layer discipline, allowed a team of fewer than 20 developers to serve over 100 million monthly visitors. The transparency of the layered model enabled rapid diagnosis of performance bottlenecks (which were almost always in the data layer) and simplified capacity planning. Their architecture has been cited repeatedly as evidence that layered monoliths, when carefully maintained, can dramatically outperform distributed alternatives in developer productivity.",
    "case_study_zh": "Stack Overflow以其经典三层分层架构著称——ASP.NET MVC表示层、包含业务逻辑的C#服务层和通过Dapper的SQL Server数据层——而非采用微服务。这种有意的简洁性，配合严格的层次纪律，使少于20名开发者的团队能够为超过1亿月活访客提供服务。分层模型的透明性使性能瓶颈的快速诊断成为可能（几乎总是在数据层），并简化了容量规划。他们的架构被反复引用为证据，证明经过精心维护的分层单体系统在开发者生产力方面可以大幅超越分布式替代方案。",
    "when_not_to_use": [
      "When the application has fundamentally different scalability requirements per component — e.g., a read-heavy reporting service alongside a write-heavy transaction processor; microservices or CQRS fit better",
      "When business logic is domain-rich and complex — layered architecture tends to produce anemic domain models; Clean Architecture or DDD is more appropriate",
      "When deployment agility is critical — a layered monolith requires coordinated deployments; teams needing independent deploy cadences should consider modular or service-based decomposition"
    ],
    "when_not_to_use_zh": [
      "当应用程序每个组件有根本不同的可扩展性要求时——例如，读取密集型报告服务与写入密集型事务处理器并存；微服务或CQRS更合适",
      "当业务逻辑领域丰富且复杂时——分层架构倾向于产生贫血领域模型；整洁架构或DDD更为合适",
      "当部署敏捷性至关重要时——分层单体需要协调部署；需要独立部署节奏的团队应考虑模块化或基于服务的分解"
    ],
    "adopters": [
      "Java EE / Jakarta EE",
      ".NET / ASP.NET MVC",
      "PHP Laravel / Symfony",
      "Python Django",
      "Stack Overflow"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "scalability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Fowler, M. (2002). Patterns of Enterprise Application Architecture, Chapter 1: Layering. Addison-Wesley.",
    "secondary_sources": [
      "Richards, M. (2015). Software Architecture Patterns. O'Reilly Media.",
      "Microsoft (2010). N-Tier Application Architecture. Microsoft Developer Network (MSDN).",
      "Gamma, E. et al. (1994). Design Patterns: Elements of Reusable Object-Oriented Software. Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "layered-architecture",
        "type": "related"
      },
      {
        "slug": "separation-of-concerns",
        "type": "extends"
      },
      {
        "slug": "modular-monolith",
        "type": "related"
      },
      {
        "slug": "clean-architecture",
        "type": "alternative"
      },
      {
        "slug": "hexagonal-architecture",
        "type": "alternative"
      },
      {
        "slug": "onion-architecture",
        "type": "alternative"
      },
      {
        "slug": "ports-and-adapters",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 322,
    "name": "Microkernel (Plugin) Architecture",
    "name_zh": "微内核（插件）架构",
    "slug": "microkernel-architecture",
    "category": "architecture",
    "desc": "Separates a minimal stable core system from interchangeable plug-in modules, enabling feature extension without modifying the core — the core provides services and a registry; plugins contribute functionality through a defined contract.",
    "desc_zh": "将最小稳定核心系统与可互换插件模块分离，使功能扩展无需修改核心——核心提供服务和注册表；插件通过定义的契约贡献功能。",
    "steps": [
      "Define the core system: the minimal set of functionality required for the system to run — plugin registry, service locator, inter-plugin communication bus, and any essential baseline services",
      "Define the plugin contract: an interface or API specification that all plugins must implement to register with and be invoked by the core — this contract must be stable and versioned",
      "Implement the plugin registry: a mechanism for the core to discover, load, and activate plugins at startup or runtime — this may be class-path scanning, a manifest file, or a service registry",
      "Develop plugins as isolated modules: each plugin implements the defined contract, carries its own dependencies, and performs a specific feature — plugins must not depend on other plugins directly",
      "Define inter-plugin communication: if plugins need to collaborate, they do so through the core's message bus or extension points — never through direct plugin-to-plugin imports"
    ],
    "steps_zh": [
      "定义核心系统：系统运行所需的最小功能集——插件注册表、服务定位器、插件间通信总线和任何基本基线服务",
      "定义插件契约：所有插件必须实现才能向核心注册并被调用的接口或API规范——此契约必须稳定且有版本控制",
      "实现插件注册表：核心在启动或运行时发现、加载和激活插件的机制——这可能是类路径扫描、清单文件或服务注册表",
      "将插件开发为隔离模块：每个插件实现定义的契约，携带自己的依赖项，并执行特定功能——插件不得直接依赖其他插件",
      "定义插件间通信：如果插件需要协作，通过核心的消息总线或扩展点进行——永不通过直接的插件到插件导入"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Core System",
      "Plugin Registry",
      "Plugin",
      "Extension Point"
    ],
    "viz_labels_zh": [
      "核心系统",
      "插件注册表",
      "插件模块",
      "扩展点"
    ],
    "related": [
      "separation-of-concerns",
      "ports-and-adapters",
      "modular-monolith"
    ],
    "tags": [
      "extensibility",
      "plugin-system",
      "open-closed-principle",
      "modularity"
    ],
    "origin_author": "OS design lineage (Mach microkernel, 1985, Carnegie Mellon); applied to software architecture by Richards and Ford (2015)",
    "origin_source": "Richards, M. (2015). Software Architecture Patterns. O'Reilly Media. (Microkernel Architecture chapter).",
    "origin_source_zh": "Richards, M.（2015）。《软件架构模式》。O'Reilly Media。（微内核架构章节）。",
    "complexity": "intermediate",
    "when_to_use": [
      "When the system must support extensibility by third parties who should not have access to the core codebase — IDEs, editors, CMS platforms",
      "When different customers or deployments need different feature sets assembled from a common component catalog",
      "When the core functionality is stable and well-understood but the peripheral feature set is expected to grow or change frequently",
      "When wanting to enforce the Open-Closed Principle at an architectural level — the core is closed for modification, open for extension via plugins"
    ],
    "when_to_use_zh": [
      "系统必须支持第三方扩展而第三方不应访问核心代码库时——IDE、编辑器、CMS平台",
      "不同客户或部署需要从通用组件目录中组装不同功能集时",
      "核心功能稳定且被充分理解，但外围功能集预计会频繁增长或更改时",
      "希望在架构级别强制执行开闭原则时——核心对修改封闭，通过插件开放扩展"
    ],
    "core_concepts": [
      "Core system: the minimal, stable host application that provides the plugin registry, lifecycle management, and shared services that plugins depend on",
      "Plugin contract: the versioned interface (API, extension point, manifest schema) that defines how plugins register themselves and what services they must provide",
      "Plugin registry: the mechanism by which the core discovers and manages plugins — file-system scanning, dependency injection containers, OSGi bundles, or a remote registry",
      "Isolation: plugins should not share mutable state or import each other's classes directly — communication happens through the core's mediation layer",
      "Versioned contracts: the plugin contract must be versioned to allow the core to evolve without breaking existing plugins — semantic versioning of extension APIs"
    ],
    "core_concepts_zh": [
      "核心系统：最小、稳定的宿主应用程序，提供插件注册表、生命周期管理和插件所依赖的共享服务",
      "插件契约：定义插件如何注册自身以及必须提供哪些服务的版本化接口（API、扩展点、清单模式）",
      "插件注册表：核心发现和管理插件的机制——文件系统扫描、依赖注入容器、OSGi包或远程注册表",
      "隔离：插件不应共享可变状态或直接导入彼此的类——通信通过核心的中介层进行",
      "版本化契约：插件契约必须有版本控制，允许核心在不破坏现有插件的情况下演进——扩展API的语义版本控制"
    ],
    "timeline": [
      [
        "1985",
        "The Mach microkernel at Carnegie Mellon formalizes the core-plugin concept at the OS level, influencing subsequent software architecture thinking"
      ],
      [
        "2001",
        "Eclipse IDE launches its OSGi-based plugin system, becoming the canonical software engineering example of microkernel architecture"
      ],
      [
        "2015",
        "Mark Richards codifies Microkernel Architecture as a named software architecture pattern in Software Architecture Patterns"
      ],
      [
        "2016",
        "VS Code launches with a language-server protocol extension model, demonstrating microkernel principles for modern developer tools"
      ]
    ],
    "timeline_zh": [
      [
        "1985",
        "卡内基梅隆大学的Mach微内核在操作系统层面正式确立核心-插件概念，影响了后续的软件架构思维"
      ],
      [
        "2001",
        "Eclipse IDE推出基于OSGi的插件系统，成为微内核架构的标准软件工程示例"
      ],
      [
        "2015",
        "Mark Richards在《软件架构模式》中将微内核架构编纂为命名的软件架构模式"
      ],
      [
        "2016",
        "VS Code推出语言服务器协议扩展模型，展示现代开发者工具的微内核原则"
      ]
    ],
    "dos": [
      "Design the plugin contract for stability from the beginning — breaking changes to the contract require coordinated updates across all plugins, which is expensive",
      "Give each plugin its own classloader or process sandbox where possible to prevent one plugin's dependency conflicts from affecting others",
      "Version the plugin API semantically and provide compatibility layers for deprecated extension points so that older plugins continue to work across core upgrades",
      "Build an integration test suite that verifies each plugin against the core contract — regression-test the contract, not just the plugin implementations"
    ],
    "dos_zh": [
      "从一开始就为稳定性设计插件契约——对契约的破坏性更改需要跨所有插件的协调更新，代价高昂",
      "尽可能为每个插件提供自己的类加载器或进程沙箱，防止一个插件的依赖冲突影响其他插件",
      "对插件API进行语义版本控制，并为已弃用的扩展点提供兼容层，使旧插件在核心升级后继续工作",
      "构建集成测试套件，验证每个插件是否符合核心契约——对契约进行回归测试，而非仅测试插件实现"
    ],
    "donts": [
      "Don't allow plugins to communicate directly with each other — plugin-to-plugin coupling defeats the extensibility goal and creates hidden dependency graphs",
      "Don't put significant business logic in the core — the core should be minimal; business logic belongs in plugins which can be independently versioned and replaced",
      "Don't neglect plugin lifecycle management — plugins that are not properly initialized, paused, or shut down leak resources and create unstable system states",
      "Don't design the plugin API in a rush — a poorly designed, leaked plugin contract is extremely difficult to evolve because it is effectively a public API commitment"
    ],
    "donts_zh": [
      "不要允许插件直接相互通信——插件间耦合会破坏可扩展性目标并创建隐藏的依赖图",
      "不要在核心中放入重要的业务逻辑——核心应该是最小的；业务逻辑属于插件，可以独立版本化和替换",
      "不要忽视插件生命周期管理——未适当初始化、暂停或关闭的插件会泄漏资源并创建不稳定的系统状态",
      "不要仓促设计插件API——设计不良、泄漏的插件契约极难演进，因为它实际上是公共API承诺"
    ],
    "case_study_company": "Microsoft (VS Code)",
    "case_study": "Visual Studio Code launched in 2016 with a microkernel architecture centered on the Language Server Protocol (LSP). The core VS Code provides editor infrastructure — text rendering, file management, the extension host process — while all language intelligence (IntelliSense, diagnostics, refactoring) is delivered through the extension API. By 2024, the VS Code Marketplace hosts over 50,000 extensions built by third parties. Critically, each extension runs in a separate Node.js process (the Extension Host), preventing a misbehaving extension from crashing the editor core. This isolation model enabled Microsoft to ship language support for over 200 programming languages without any core code changes.",
    "case_study_zh": "Visual Studio Code于2016年推出，采用以语言服务器协议（LSP）为中心的微内核架构。核心VS Code提供编辑器基础设施——文本渲染、文件管理、扩展宿主进程——而所有语言智能（IntelliSense、诊断、重构）通过扩展API提供。到2024年，VS Code市场拥有第三方构建的超过50,000个扩展。关键的是，每个扩展在独立的Node.js进程（扩展宿主）中运行，防止行为异常的扩展使编辑器核心崩溃。这种隔离模型使微软能够在不进行任何核心代码更改的情况下为200多种编程语言提供语言支持。",
    "when_not_to_use": [
      "For simple applications with a fixed, well-known feature set that will not need third-party extensibility — the plugin infrastructure adds complexity without payoff",
      "When tight integration between features is required — the loose coupling between plugins makes cross-plugin workflows awkward and difficult to optimize",
      "When performance is critical and the plugin dispatch overhead (registry lookups, interface calls) is not acceptable — highly optimized systems often need direct coupling"
    ],
    "when_not_to_use_zh": [
      "对于具有固定、已知功能集且不需要第三方可扩展性的简单应用程序——插件基础设施增加复杂性而无收益",
      "当功能之间需要紧密集成时——插件之间的松耦合使跨插件工作流程变得笨拙且难以优化",
      "当性能至关重要且插件分发开销（注册表查找、接口调用）不可接受时——高度优化的系统通常需要直接耦合"
    ],
    "adopters": [
      "Eclipse IDE",
      "Visual Studio Code",
      "Webpack",
      "WordPress",
      "Google Chrome"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "portability"
    ],
    "maturity_ring": "established",
    "primary_source": "Richards, M. (2015). Software Architecture Patterns. O'Reilly Media.",
    "secondary_sources": [
      "Ford, N. & Richards, M. (2020). Fundamentals of Software Architecture. O'Reilly Media.",
      "Microsoft (2016). VS Code Extension API. code.visualstudio.com/api.",
      "Eclipse Foundation (2003). Eclipse Platform Architecture Overview. eclipse.org."
    ],
    "typed_relations": [
      {
        "slug": "separation-of-concerns",
        "type": "extends"
      },
      {
        "slug": "ports-and-adapters",
        "type": "related"
      },
      {
        "slug": "modular-monolith",
        "type": "related"
      }
    ]
  },
  {
    "id": 323,
    "name": "Flux / Unidirectional Data Flow",
    "name_zh": "Flux / 单向数据流",
    "slug": "flux-unidirectional",
    "category": "architecture",
    "desc": "Enforces a strict one-way data cycle — Action → Dispatcher → Store → View → Action — eliminating the cascading update problems of two-way binding by making state changes predictable and traceable.",
    "desc_zh": "强制执行严格的单向数据循环——动作→调度器→存储→视图→动作——通过使状态变更可预测和可追踪，消除双向绑定的级联更新问题。",
    "steps": [
      "Define Actions: plain data objects (or objects with a type string) that describe what happened in the system — user interactions, API responses, timer events",
      "Implement a Dispatcher: a singleton registry that receives all Actions and broadcasts them sequentially to all registered Stores — no action may dispatch another action synchronously",
      "Implement Stores: modules that hold application state and business logic; each Store registers with the Dispatcher, handles the Actions it cares about, and emits a change event when its state updates",
      "Implement Views: React components or UI elements that subscribe to Store change events, re-render from Store state, and emit new Actions in response to user gestures",
      "Close the loop: user interaction → View emits Action → Dispatcher broadcasts → Stores update → Views re-render; debug by inspecting the Action log for the full state history"
    ],
    "steps_zh": [
      "定义动作：描述系统中发生了什么的普通数据对象（或带有类型字符串的对象）——用户交互、API响应、定时器事件",
      "实现调度器：接收所有动作并将其顺序广播到所有注册存储的单例注册表——任何动作不得同步分发另一个动作",
      "实现存储：持有应用程序状态和业务逻辑的模块；每个存储向调度器注册，处理其关心的动作，并在状态更新时发出变更事件",
      "实现视图：订阅存储变更事件、从存储状态重新渲染、并响应用户手势发出新动作的React组件或UI元素",
      "闭合循环：用户交互→视图发出动作→调度器广播→存储更新→视图重新渲染；通过检查动作日志获取完整状态历史来调试"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Action",
      "Dispatcher",
      "Store",
      "View"
    ],
    "viz_labels_zh": [
      "动作",
      "调度器",
      "数据仓库",
      "视图"
    ],
    "related": [
      "observer-pattern",
      "eda",
      "separation-of-concerns",
      "mvc",
      "mvvm",
      "mvp"
    ],
    "tags": [
      "state-management",
      "unidirectional",
      "react",
      "frontend-architecture"
    ],
    "origin_author": "Facebook (Jing Chen, Bill Fisher), 2014",
    "origin_source": "Chen, J. & Fisher, B. (2014). Flux: An Application Architecture for React. Facebook Engineering talk at F8 conference, May.",
    "origin_source_zh": "Chen, J. & Fisher, B.（2014）。《Flux：React的应用架构》。Facebook工程师在F8大会上的演讲，5月。",
    "complexity": "intermediate",
    "when_to_use": [
      "When a complex client-side application has shared state that multiple UI components read and mutate, leading to synchronization bugs with two-way binding",
      "When debugging complex UI bugs that stem from unclear data flow — Flux's action log makes the sequence of state changes fully auditable",
      "When using React or a similar component-based framework where unidirectional data flow is the idiomatic state management approach",
      "When the team needs to implement time-travel debugging or undo/redo — the action log is a natural event source for replaying state"
    ],
    "when_to_use_zh": [
      "复杂客户端应用程序具有多个UI组件读写的共享状态，导致双向绑定的同步错误时",
      "调试源于不清晰数据流的复杂UI错误时——Flux的动作日志使状态变更序列完全可审计",
      "使用React或类似基于组件的框架时，单向数据流是惯用的状态管理方法",
      "团队需要实现时间旅行调试或撤销/重做时——动作日志是重放状态的自然事件源"
    ],
    "core_concepts": [
      "Unidirectional flow: data moves in one direction only — Action → Dispatcher → Store → View → Action — eliminating the feedback loops that make bidirectional binding hard to debug",
      "Actions: the only mechanism for changing state — explicit, serializable objects that describe what happened, never how the state should change",
      "Dispatcher: the global event bus that serializes all Actions and broadcasts them to every registered Store — enforcing that only one Action is processed at a time",
      "Stores: stateful modules that own specific slices of application state; they respond to Actions by computing new state and notifying Views",
      "Single source of truth: each piece of application state lives in exactly one Store — Views derive their display state from Stores rather than maintaining their own copies"
    ],
    "core_concepts_zh": [
      "单向流：数据仅在一个方向移动——动作→调度器→存储→视图→动作——消除使双向绑定难以调试的反馈循环",
      "动作：改变状态的唯一机制——明确的、可序列化的描述发生了什么的对象，而非状态应如何改变",
      "调度器：将所有动作序列化并广播到每个注册存储的全局事件总线——强制一次只处理一个动作",
      "存储：拥有特定应用程序状态切片的有状态模块；通过计算新状态并通知视图来响应动作",
      "单一数据源：每片应用程序状态恰好存在于一个存储中——视图从存储派生其显示状态而非维护自己的副本"
    ],
    "timeline": [
      [
        "2014",
        "Facebook's Jing Chen introduces Flux at the F8 developer conference as the solution to cascading MVC update bugs in Facebook Chat"
      ],
      [
        "2015",
        "Dan Abramov releases Redux, a simplified Flux implementation with a single immutable Store and pure reducer functions, which becomes the dominant React state library"
      ],
      [
        "2016",
        "Vue.js releases Vuex, bringing Flux-inspired unidirectional state management to the Vue ecosystem"
      ],
      [
        "2019",
        "React introduces the useReducer hook and Context API, making lightweight Flux-style state management available natively without external libraries"
      ]
    ],
    "timeline_zh": [
      [
        "2014",
        "Facebook的Jing Chen在F8开发者大会上介绍Flux，作为解决Facebook Chat中级联MVC更新错误的方案"
      ],
      [
        "2015",
        "Dan Abramov发布Redux，一个具有单一不可变存储和纯reducer函数的简化Flux实现，成为主流React状态库"
      ],
      [
        "2016",
        "Vue.js发布Vuex，为Vue生态系统带来受Flux启发的单向状态管理"
      ],
      [
        "2019",
        "React引入useReducer钩子和Context API，使轻量级Flux风格的状态管理无需外部库即可原生使用"
      ]
    ],
    "dos": [
      "Keep Actions as plain, serializable data objects with a type field — this makes them loggable, replayable, and easy to test",
      "Make Store update logic pure and synchronous — asynchronous operations belong in Action creators or middleware, not inside Store handlers",
      "Define all possible Actions as an explicit set (string enum or constant map) so that typos in action type strings are caught at development time",
      "Derive computed state in Views (or selectors) rather than storing redundant derived data in Stores — storing derived data creates synchronization bugs"
    ],
    "dos_zh": [
      "将动作保持为带有类型字段的普通、可序列化数据对象——这使其可记录、可重放且易于测试",
      "使存储更新逻辑纯粹且同步——异步操作属于动作创建器或中间件，而非存储处理器内部",
      "将所有可能的动作定义为明确集合（字符串枚举或常量映射），以便在开发时捕获动作类型字符串中的拼写错误",
      "在视图（或选择器）中派生计算状态，而非在存储中保存冗余派生数据——保存派生数据会导致同步错误"
    ],
    "donts": [
      "Don't dispatch Actions from within a Store's Action handler — the Dispatcher enforces that only one Action is in-flight at a time; nested dispatches throw errors",
      "Don't let Views mutate Store state directly — all state changes must go through the Action → Dispatcher → Store cycle, even for local UI state",
      "Don't create one mega-Store for all application state — split Stores by domain boundary (UserStore, CartStore, NotificationStore) to limit the scope of re-renders",
      "Don't perform async operations (API calls, timers) inside the Dispatcher's callback chain — use Action creators with async/await or middleware for side effects"
    ],
    "donts_zh": [
      "不要在存储的动作处理器内分发动作——调度器强制一次只有一个动作在处理中；嵌套分发会抛出错误",
      "不要让视图直接修改存储状态——所有状态变更必须经过动作→调度器→存储循环，即使对于本地UI状态",
      "不要为所有应用程序状态创建一个巨型存储——按领域边界拆分存储（UserStore、CartStore、NotificationStore）以限制重新渲染范围",
      "不要在调度器回调链内执行异步操作（API调用、定时器）——为副作用使用带async/await的动作创建器或中间件"
    ],
    "case_study_company": "Facebook",
    "case_study": "Facebook developed Flux to solve a specific, devastating bug in Facebook Chat: messages would be marked as read, but an unread count badge would persist showing incorrect counts. Root cause analysis revealed MVC's bidirectional data flow allowed controller updates to trigger cascading model updates, which triggered further controller updates — creating an unpredictable cycle of state changes. By replacing the bidirectional flow with a strict Action → Dispatcher → Store → View cycle, Facebook engineers made the data flow explicit and debuggable. After Flux adoption, the Chat unread count bug was eliminated and the team reported a significant reduction in state-synchronization bugs across the Facebook web application.",
    "case_study_zh": "Facebook开发Flux是为了解决Facebook Chat中一个特定的严重错误：消息被标记为已读，但未读计数徽章持续显示错误计数。根因分析揭示MVC的双向数据流允许控制器更新触发级联模型更新，后者又触发进一步的控制器更新——创建不可预测的状态变更循环。通过用严格的动作→调度器→存储→视图循环替换双向流，Facebook工程师使数据流变得明确且可调试。采用Flux后，Chat未读计数错误被消除，团队报告Facebook Web应用程序中状态同步错误显著减少。",
    "when_not_to_use": [
      "For simple applications with local component state only — React's useState hook or Vue's reactive data handles this without the Flux overhead",
      "When the team is not using a component-based UI framework — Flux's View layer assumes a component model; it does not translate well to server-rendered HTML",
      "For applications with predominantly server-side state — Flux manages client-side state; server state libraries (React Query, SWR) are a better fit for data fetching"
    ],
    "when_not_to_use_zh": [
      "对于仅有本地组件状态的简单应用程序——React的useState钩子或Vue的响应式数据可以处理这些而无需Flux开销",
      "当团队不使用基于组件的UI框架时——Flux的视图层假定组件模型；它不能很好地转化为服务器渲染的HTML",
      "对于以服务器端状态为主的应用程序——Flux管理客户端状态；服务器状态库（React Query、SWR）更适合数据获取"
    ],
    "adopters": [
      "Facebook / Meta",
      "React + Redux ecosystem",
      "Vuex (Vue.js)",
      "NgRx (Angular)",
      "Elm (The Elm Architecture)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "Chen, J. & Fisher, B. (2014). Flux: An Application Architecture for React. Facebook Engineering, F8 Conference.",
    "secondary_sources": [
      "Abramov, D. (2015). Redux Documentation — Motivation. redux.js.org.",
      "Facebook (2014). Flux — Application Architecture for Building User Interfaces. github.com/facebook/flux.",
      "Abramov, D. & Clark, A. (2016). Getting Started with Redux (Egghead.io course). egghead.io."
    ],
    "typed_relations": [
      {
        "slug": "observer-pattern",
        "type": "extends"
      },
      {
        "slug": "eda",
        "type": "related"
      },
      {
        "slug": "separation-of-concerns",
        "type": "complement"
      },
      {
        "slug": "mvc",
        "type": "alternative"
      },
      {
        "slug": "mvvm",
        "type": "alternative"
      },
      {
        "slug": "mvp",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 29,
    "name": "SOLID Principles",
    "name_zh": "SOLID 原则",
    "slug": "solid-principles",
    "category": "coding",
    "desc": "Five OOP design principles for maintainable, flexible code",
    "desc_zh": "面向对象设计的五大原则，打造可维护、灵活的代码",
    "steps": [
      "Single Responsibility: ensure each class or module has exactly one reason to change by isolating concerns into separate units",
      "Open/Closed: design modules that are open for extension (via interfaces, plugins) but closed for modification of existing code",
      "Liskov Substitution: verify that subtypes can replace their base types without altering the correctness of the program",
      "Interface Segregation: split large interfaces into smaller, client-specific ones so implementers are not forced to depend on unused methods",
      "Dependency Inversion: depend on abstractions rather than concrete implementations; inject dependencies through constructors or configuration"
    ],
    "steps_zh": [
      "单一职责：确保每个类或模块只有一个变更原因，将关注点隔离到独立单元中",
      "开闭原则：设计对扩展开放（通过接口、插件）但对现有代码修改封闭的模块",
      "里氏替换：验证子类型可以替换其基类型而不改变程序的正确性",
      "接口隔离：将大接口拆分为更小的客户端专用接口，避免实现者依赖未使用的方法",
      "依赖倒置：依赖抽象而非具体实现；通过构造函数或配置注入依赖"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Single Responsibility",
      "Open/Closed",
      "Liskov Substitution",
      "Interface Segregation",
      "Dependency Inversion"
    ],
    "viz_labels_zh": [
      "单一职责",
      "开闭原则",
      "里氏替换",
      "接口隔离",
      "依赖倒置"
    ],
    "related": [
      "clean-code-principles",
      "grasp-patterns",
      "dependency-injection"
    ],
    "tags": [
      "solid",
      "oop",
      "design-principles",
      "maintainability"
    ],
    "origin_author": "Robert C. Martin, 2000",
    "origin_source": "Design Principles and Design Patterns",
    "origin_source_zh": "《设计原则与设计模式》",
    "complexity": "intermediate",
    "when_to_use": [
      "Designing object-oriented systems that need to evolve over time",
      "Refactoring legacy codebases to improve modularity and testability",
      "Establishing coding standards for a team working on shared libraries",
      "Building plugin-based architectures that require extensibility"
    ],
    "when_to_use_zh": [
      "设计需要长期演进的面向对象系统",
      "重构遗留代码库以提升模块化和可测试性",
      "为共同维护共享库的团队建立编码规范",
      "构建需要可扩展性的插件式架构"
    ],
    "core_concepts": [
      "Single Responsibility Principle: a class should have only one reason to change, aligning each module with a single actor or stakeholder",
      "Open/Closed Principle: software entities should be open for extension but closed for modification, enabling new behavior without touching existing code",
      "Liskov Substitution Principle: objects of a superclass should be replaceable with objects of a subclass without breaking program correctness",
      "Interface Segregation Principle: clients should not be forced to depend on interfaces they do not use",
      "Dependency Inversion Principle: high-level modules should depend on abstractions, not on low-level concrete implementations"
    ],
    "core_concepts_zh": [
      "单一职责原则：一个类应当只有一个变更原因，使每个模块对应一个参与者或利益相关方",
      "开闭原则：软件实体应对扩展开放、对修改封闭，在不触碰已有代码的前提下引入新行为",
      "里氏替换原则：超类的对象应当可以被其子类的对象替换而不破坏程序正确性",
      "接口隔离原则：客户端不应被迫依赖其不使用的接口",
      "依赖倒置原则：高层模块应依赖抽象，而非低层具体实现"
    ],
    "timeline": [
      [
        "1988",
        "Barbara Liskov formally defines the Liskov Substitution Principle"
      ],
      [
        "2000",
        "Robert C. Martin publishes 'Design Principles and Design Patterns' introducing SOLID as a set"
      ],
      [
        "2003",
        "Martin's 'Agile Software Development' book popularizes SOLID in the agile community"
      ],
      [
        "2004",
        "Michael Feathers coins the SOLID acronym, making the principles easier to remember and teach"
      ]
    ],
    "timeline_zh": [
      [
        "1988",
        "Barbara Liskov 正式定义里氏替换原则"
      ],
      [
        "2000",
        "Robert C. Martin 发表《设计原则与设计模式》，首次将 SOLID 作为一组原则提出"
      ],
      [
        "2003",
        "Martin 的《敏捷软件开发》一书在敏捷社区中推广了 SOLID"
      ],
      [
        "2004",
        "Michael Feathers 创造了 SOLID 缩写，使这些原则更易于记忆和传授"
      ]
    ],
    "dos": [
      "Do favor composition over inheritance because it naturally aligns with SRP and reduces tight coupling",
      "Do define interfaces from the client's perspective because this ensures ISP compliance",
      "Do inject dependencies through constructors because it makes classes testable and explicit about their needs",
      "Do apply SOLID incrementally during refactoring because premature abstraction adds unnecessary complexity"
    ],
    "dos_zh": [
      "优先使用组合而非继承，因为组合天然符合单一职责并减少紧耦合",
      "从客户端视角定义接口，因为这能确保符合接口隔离原则",
      "通过构造函数注入依赖，因为这使类可测试且明确表达其需求",
      "在重构过程中渐进式地应用 SOLID，因为过早抽象会增加不必要的复杂性"
    ],
    "donts": [
      "Don't create god classes that handle multiple unrelated responsibilities because they become change magnets",
      "Don't violate LSP by throwing unexpected exceptions in subclasses because callers rely on base type contracts",
      "Don't build fat interfaces that force implementers to write empty stub methods because it creates confusing APIs",
      "Don't apply SOLID dogmatically to simple scripts because the overhead outweighs the benefits in small codebases"
    ],
    "donts_zh": [
      "不要创建处理多个不相关职责的上帝类，因为它们会成为变更磁铁",
      "不要在子类中抛出意外异常来违反 LSP，因为调用方依赖基类型契约",
      "不要构建迫使实现者编写空桩方法的臃肿接口，因为这会造成令人困惑的 API",
      "不要对简单脚本教条式地应用 SOLID，因为在小型代码库中其开销大于收益"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify's backend squad model relies heavily on SOLID principles. Each microservice is designed with single responsibility in mind, and dependency inversion allows squads to swap out infrastructure adapters (e.g., switching message brokers) without modifying core domain logic. This architectural discipline enables over 200 squads to deploy independently multiple times per day.",
    "case_study_zh": "Spotify 的后端小队模式深度依赖 SOLID 原则。每个微服务都以单一职责为核心设计，依赖倒置使各小队能够替换基础设施适配器（如切换消息中间件）而无需修改核心领域逻辑。这种架构纪律让超过 200 个小队能够每天独立部署多次。",
    "when_not_to_use": [
      "Throwaway prototypes or spike experiments where speed matters more than design quality",
      "Very small scripts or single-file utilities with no anticipated future changes",
      "Performance-critical inner loops where abstraction layers add unacceptable overhead",
      "Functional programming codebases where SOLID's OOP assumptions do not apply"
    ],
    "when_not_to_use_zh": [
      "一次性原型或探针实验，速度比设计质量更重要时",
      "非常小的脚本或不预期未来变更的单文件工具",
      "性能关键的内层循环，抽象层会带来不可接受的开销",
      "函数式编程代码库，SOLID 的面向对象假设不适用"
    ],
    "adopters": [
      "Microsoft",
      "Google",
      "Spotify",
      "Thoughtworks",
      "JetBrains"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Robert C. Martin (2000). \"Design Principles and Design Patterns\". objectmentor.com.",
    "secondary_sources": [
      "Robert C. Martin (2003). \"Agile Software Development, Principles, Patterns, and Practices\". Prentice Hall.",
      "Robert C. Martin (2017). \"Clean Architecture: A Craftsman's Guide to Software Structure and Design\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "grasp-patterns",
        "type": "complement"
      },
      {
        "slug": "dependency-injection",
        "type": "complement"
      }
    ]
  },
  {
    "id": 30,
    "name": "GRASP Patterns",
    "name_zh": "GRASP 模式",
    "slug": "grasp-patterns",
    "category": "coding",
    "desc": "Nine patterns for assigning responsibility to classes properly",
    "desc_zh": "九种将职责正确分配给类的通用模式",
    "steps": [
      "Apply Information Expert: assign each responsibility to the class that has the information necessary to fulfill it",
      "Use Creator pattern: give class B the responsibility to create class A if B contains, aggregates, or closely uses A",
      "Enforce Low Coupling and High Cohesion: minimize dependencies between classes while keeping related behaviors together",
      "Apply Controller: assign system event handling to a use-case controller or facade that delegates to domain objects",
      "Use Polymorphism and Indirection: handle variation through polymorphic dispatch and introduce intermediary objects to reduce direct coupling"
    ],
    "steps_zh": [
      "应用信息专家：将每个职责分配给拥有完成该职责所需信息的类",
      "使用创建者模式：如果类B包含、聚合或密切使用类A，则由B负责创建A",
      "强制低耦合与高内聚：最小化类间依赖，同时将相关行为保持在一起",
      "应用控制器：将系统事件处理分配给用例控制器或门面，再委托给领域对象",
      "使用多态与间接：通过多态分派处理变化，引入中介对象减少直接耦合"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Info Expert",
      "Creator",
      "Controller",
      "Low Coupling",
      "High Cohesion"
    ],
    "viz_labels_zh": [
      "信息专家",
      "创建者",
      "控制器",
      "低耦合",
      "高内聚"
    ],
    "related": [
      "solid-principles",
      "gof-design-patterns",
      "clean-code-principles"
    ],
    "tags": [
      "grasp",
      "responsibility-assignment",
      "oop",
      "coupling",
      "cohesion"
    ],
    "origin_author": "Craig Larman, 2004",
    "origin_source": "Applying UML and Patterns: An Introduction to Object-Oriented Analysis and Design and Iterative Development",
    "origin_source_zh": "《UML和模式应用：面向对象分析与设计及迭代开发导论》",
    "complexity": "intermediate",
    "when_to_use": [
      "Deciding which class should own a particular method or behavior",
      "Teaching junior developers how to think about responsibility assignment in OOP",
      "Reviewing domain models to identify misplaced logic or god objects",
      "Designing new classes during iterative object-oriented analysis"
    ],
    "when_to_use_zh": [
      "决定哪个类应拥有特定方法或行为时",
      "教导初级开发者如何在面向对象编程中思考职责分配",
      "审查领域模型以识别放置不当的逻辑或上帝对象",
      "在迭代式面向对象分析中设计新类"
    ],
    "core_concepts": [
      "Information Expert: assign responsibility to the class that has the data needed to fulfill it, reducing data transfer between objects",
      "Creator: determine which class should create instances of another based on containment, aggregation, or close usage relationships",
      "Low Coupling: minimize interdependencies between classes so that changes in one class have minimal ripple effects",
      "High Cohesion: keep each class focused on a narrow, well-defined set of related responsibilities",
      "Controller: assign the responsibility for handling system events to a non-UI object that coordinates the use case"
    ],
    "core_concepts_zh": [
      "信息专家：将职责分配给拥有完成该职责所需数据的类，减少对象间的数据传递",
      "创建者：根据包含、聚合或密切使用关系确定哪个类应负责创建另一个类的实例",
      "低耦合：最小化类间的相互依赖，使一个类的变更对其他类的影响最小",
      "高内聚：保持每个类专注于范围窄小、定义良好的相关职责集合",
      "控制器：将系统事件处理职责分配给协调用例的非 UI 对象"
    ],
    "timeline": [
      [
        "1997",
        "Craig Larman introduces GRASP in the first edition of 'Applying UML and Patterns'"
      ],
      [
        "2004",
        "Third edition of the book consolidates all nine patterns and becomes a university textbook standard"
      ],
      [
        "2010",
        "GRASP gains renewed interest as complement to SOLID in enterprise Java and C# communities"
      ],
      [
        "2015",
        "GRASP patterns are adapted for microservice responsibility assignment in distributed systems literature"
      ]
    ],
    "timeline_zh": [
      [
        "1997",
        "Craig Larman 在《UML和模式应用》第一版中引入 GRASP"
      ],
      [
        "2004",
        "第三版整合了全部九种模式，成为高校教材标准"
      ],
      [
        "2010",
        "GRASP 在企业级 Java 和 C# 社区中作为 SOLID 的补充再次受到关注"
      ],
      [
        "2015",
        "GRASP 模式被改编用于分布式系统文献中的微服务职责分配"
      ]
    ],
    "dos": [
      "Do start with Information Expert when unsure where to place a method because it naturally reduces coupling",
      "Do use Controller pattern to separate UI from business logic because it enables independent testing",
      "Do evaluate cohesion when a class grows beyond five methods because low cohesion indicates mixed responsibilities",
      "Do combine GRASP with SOLID because they are complementary and reinforce each other"
    ],
    "dos_zh": [
      "当不确定方法应放在哪里时，优先用信息专家模式，因为它自然地减少耦合",
      "使用控制器模式将 UI 与业务逻辑分离，因为这能实现独立测试",
      "当一个类的方法超过五个时评估内聚性，因为低内聚表明职责混杂",
      "将 GRASP 与 SOLID 结合使用，因为它们互为补充、相互强化"
    ],
    "donts": [
      "Don't treat GRASP as rigid rules because they are guidelines requiring contextual judgment",
      "Don't assign creation responsibility to a class just because it is convenient without evaluating aggregation relationships",
      "Don't over-apply Indirection by adding unnecessary mediator classes because it increases complexity without benefit",
      "Don't ignore GRASP when doing domain modeling because applying it prevents common mistakes in responsibility assignment"
    ],
    "donts_zh": [
      "不要将 GRASP 视为死板的规则，因为它们是需要结合上下文判断的指导原则",
      "不要仅因方便就将创建职责分配给某个类，而不评估聚合关系",
      "不要通过添加不必要的中介类来过度应用间接模式，因为这会增加复杂性而无收益",
      "不要在领域建模时忽视 GRASP，因为它能防止职责分配中的常见错误"
    ],
    "case_study_company": "Ericsson",
    "case_study": "Ericsson's telecom platform team adopted GRASP patterns when restructuring their legacy billing system. By systematically applying Information Expert and Creator patterns, they reduced the number of cross-module dependencies by 40% and shortened the average time to implement new billing features from three weeks to five days.",
    "case_study_zh": "爱立信的电信平台团队在重构遗留计费系统时采用了 GRASP 模式。通过系统性地应用信息专家和创建者模式，他们将跨模块依赖减少了 40%，新计费功能的平均实现周期从三周缩短到五天。",
    "when_not_to_use": [
      "Purely functional codebases where objects and classes are not the primary abstraction",
      "Rapid prototyping where responsibility placement will be revisited soon",
      "Data-centric CRUD applications where anemic models with service layers are the deliberate choice",
      "Very small projects with only a handful of classes where the overhead of formal patterns is unnecessary"
    ],
    "when_not_to_use_zh": [
      "纯函数式代码库，对象和类不是主要抽象手段",
      "快速原型开发阶段，职责分配很快会被重新审视",
      "以数据为中心的 CRUD 应用，贫血模型配合服务层是刻意的选择",
      "仅有少数几个类的极小项目，正式模式的开销没有必要"
    ],
    "adopters": [
      "Ericsson",
      "SAP",
      "Oracle",
      "Siemens",
      "IBM"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Craig Larman (2004). \"Applying UML and Patterns: An Introduction to Object-Oriented Analysis and Design and Iterative Development, 3rd Edition\". Prentice Hall.",
    "secondary_sources": [
      "Craig Larman (2002). \"Applying UML and Patterns, 2nd Edition\". Prentice Hall.",
      "Rebecca Wirfs-Brock and Alan McKean (2002). \"Object Design: Roles, Responsibilities, and Collaborations\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "gof-design-patterns",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      }
    ]
  },
  {
    "id": 31,
    "name": "GoF Design Patterns",
    "name_zh": "GoF 设计模式",
    "slug": "gof-design-patterns",
    "category": "coding",
    "desc": "23 classic creational, structural, behavioral design patterns",
    "desc_zh": "23种经典的创建型、结构型、行为型设计模式",
    "steps": [
      "Identify the design problem category: is it about object creation (Creational), composition (Structural), or interaction (Behavioral)?",
      "Select candidate patterns: match the problem to patterns like Factory, Strategy, Observer, Decorator, or Adapter based on the forces at play",
      "Study the pattern's structure: understand participants (classes/interfaces), collaborations, and the consequences (trade-offs) of the pattern",
      "Implement with modern idioms: adapt the classic pattern to your language's features (closures, generics, protocols) rather than copying Java-era UML",
      "Document pattern usage: record in code comments or ADRs why the pattern was chosen and what alternatives were considered"
    ],
    "steps_zh": [
      "识别设计问题类别：是关于对象创建（创建型）、组合（结构型）还是交互（行为型）？",
      "选择候选模式：根据问题中相互制约的因素（forces）将其匹配到工厂、策略、观察者、装饰器或适配器等模式",
      "研究模式结构：理解参与者（类/接口）、协作方式以及模式的后果（权衡）",
      "用现代惯用法实现：将经典模式适配到你的语言特性（闭包、泛型、协议），而非照搬Java时代的UML",
      "记录模式使用：在代码注释或ADR中记录选择该模式的原因及考虑过的替代方案"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Creational",
      "Structural",
      "Behavioral",
      "Pattern",
      "Intent"
    ],
    "viz_labels_zh": [
      "创建型",
      "结构型",
      "行为型",
      "模式",
      "意图"
    ],
    "related": [
      "solid-principles",
      "grasp-patterns",
      "hexagonal-architecture"
    ],
    "tags": [
      "design-patterns",
      "gof",
      "creational",
      "structural",
      "behavioral"
    ],
    "origin_author": "Gang of Four (Erich Gamma, Richard Helm, Ralph Johnson, John Vlissides), 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "intermediate",
    "when_to_use": [
      "Solving recurring design problems in object-oriented software with proven solutions",
      "Establishing a shared design vocabulary across a development team",
      "Decoupling systems by introducing abstraction layers for creation, structure, or behavior",
      "Evolving a codebase to handle new requirements without rewriting existing modules"
    ],
    "when_to_use_zh": [
      "用经过验证的方案解决面向对象软件中反复出现的设计问题",
      "在开发团队中建立共享的设计词汇表",
      "通过引入创建、结构或行为的抽象层来解耦系统",
      "演进代码库以处理新需求，而不重写已有模块"
    ],
    "core_concepts": [
      "Creational Patterns: abstract the instantiation process to make systems independent of how objects are created (Factory Method, Abstract Factory, Builder, Prototype, Singleton)",
      "Structural Patterns: describe how classes and objects are composed to form larger structures (Adapter, Bridge, Composite, Decorator, Facade, Flyweight, Proxy)",
      "Behavioral Patterns: define how objects interact and distribute responsibility (Strategy, Observer, Command, Iterator, Mediator, State, Template Method, Visitor, Chain of Responsibility, Memento, Interpreter)",
      "Program to an interface, not an implementation: depend on abstract types so concrete classes can be swapped",
      "Favor composition over inheritance: assemble behavior by composing objects rather than extending class hierarchies"
    ],
    "core_concepts_zh": [
      "创建型模式：抽象实例化过程，使系统独立于对象的创建方式（工厂方法、抽象工厂、建造者、原型、单例）",
      "结构型模式：描述类和对象如何组合形成更大的结构（适配器、桥接、组合、装饰器、外观、享元、代理）",
      "行为型模式：定义对象如何交互和分配职责（策略、观察者、命令、迭代器、中介者、状态、模板方法、访问者、职责链、备忘录、解释器）",
      "面向接口编程而非面向实现：依赖抽象类型，使具体类可以互换",
      "优先使用组合而非继承：通过组合对象来装配行为，而非扩展类继承层次"
    ],
    "timeline": [
      [
        "1987",
        "Ward Cunningham and Kent Beck begin cataloging Smalltalk design patterns inspired by Christopher Alexander's architectural patterns"
      ],
      [
        "1994",
        "The 'Gang of Four' publish 'Design Patterns', establishing 23 canonical patterns"
      ],
      [
        "2004",
        "Joshua Kerievsky publishes 'Refactoring to Patterns', bridging the gap between patterns and refactoring"
      ],
      [
        "2014",
        "Modern language features (lambdas, generics) reduce boilerplate, leading to simpler pattern implementations"
      ]
    ],
    "timeline_zh": [
      [
        "1987",
        "Ward Cunningham 和 Kent Beck 受 Christopher Alexander 建筑模式启发，开始编目 Smalltalk 设计模式"
      ],
      [
        "1994",
        "「四人帮」出版《设计模式》，确立 23 种经典模式"
      ],
      [
        "2004",
        "Joshua Kerievsky 出版《重构与模式》，架起了模式与重构之间的桥梁"
      ],
      [
        "2014",
        "现代语言特性（lambda、泛型）减少了模板代码，使模式实现更加简洁"
      ]
    ],
    "dos": [
      "Do learn the intent of each pattern before memorizing its structure because understanding the 'why' prevents misapplication",
      "Do adapt patterns to your language's idioms because a Python Strategy pattern looks very different from a Java one",
      "Do combine patterns when needed because real-world problems often require Factory + Strategy or Decorator + Composite",
      "Do document which pattern you used and why because future maintainers need context to evolve the design"
    ],
    "dos_zh": [
      "在记住模式结构之前先理解其意图，因为理解「为什么」能防止误用",
      "将模式适配到你所用语言的惯用法，因为 Python 的策略模式与 Java 的看起来截然不同",
      "在需要时组合使用多个模式，因为现实问题通常需要工厂+策略或装饰器+组合",
      "记录你使用了哪个模式及其原因，因为未来的维护者需要上下文来演进设计"
    ],
    "donts": [
      "Don't force patterns where a simple function call suffices because over-engineering reduces readability",
      "Don't use Singleton as a global variable because it hides dependencies and makes testing difficult",
      "Don't apply patterns preemptively because YAGNI often applies — wait for the need to emerge",
      "Don't treat the 23 GoF patterns as exhaustive because many modern patterns (Null Object, Service Locator) emerged later"
    ],
    "donts_zh": [
      "不要在简单函数调用即可满足需求时强行套用模式，因为过度工程降低可读性",
      "不要将单例当作全局变量使用，因为它隐藏了依赖关系且使测试困难",
      "不要预防性地应用模式，因为 YAGNI 原则通常适用——等需求出现再说",
      "不要将 23 种 GoF 模式视为完整集合，因为许多现代模式（空对象、服务定位器）是后来出现的"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix extensively uses GoF patterns across its microservice ecosystem. The API Gateway employs the Facade pattern to simplify backend complexity for client devices, while the Hystrix library (now succeeded by Resilience4j) implemented the Command pattern to wrap every service call with circuit-breaking and fallback behavior. This pattern-driven approach has been key to achieving 99.99% availability.",
    "case_study_zh": "Netflix 在其微服务生态系统中广泛使用 GoF 模式。API 网关采用外观模式简化客户端设备面对的后端复杂性，而 Hystrix 库（现已被 Resilience4j 接替）实现了命令模式来为每个服务调用包装熔断和降级行为。这种模式驱动的方法是实现 99.99% 可用性的关键。",
    "when_not_to_use": [
      "Simple CRUD applications where patterns add unnecessary abstraction layers",
      "Functional programming contexts where higher-order functions replace most behavioral patterns",
      "One-off scripts or automation tasks where design longevity is irrelevant",
      "Early-stage prototypes where premature pattern application slows exploration"
    ],
    "when_not_to_use_zh": [
      "简单的 CRUD 应用，模式会添加不必要的抽象层",
      "函数式编程场景，高阶函数替代了大多数行为型模式",
      "一次性脚本或自动化任务，设计持久性无关紧要",
      "早期原型阶段，过早应用模式会拖慢探索速度"
    ],
    "adopters": [
      "Netflix",
      "Amazon",
      "Google",
      "Microsoft",
      "Apple"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Erich Gamma, Richard Helm, Ralph Johnson, and John Vlissides (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Joshua Kerievsky (2004). \"Refactoring to Patterns\". Addison-Wesley.",
      "Eric Freeman and Elisabeth Robson (2004). \"Head First Design Patterns\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "prerequisite"
      },
      {
        "slug": "grasp-patterns",
        "type": "complement"
      },
      {
        "slug": "hexagonal-architecture",
        "type": "related"
      }
    ]
  },
  {
    "id": 32,
    "name": "Clean Code Principles",
    "name_zh": "整洁代码原则",
    "slug": "clean-code-principles",
    "category": "coding",
    "desc": "Write readable, simple, expressive code that minimizes surprise",
    "desc_zh": "编写可读、简洁、表达力强且最小化意外的代码",
    "steps": [
      "Name with intent: choose variable, function, and class names that reveal purpose; avoid abbreviations and misleading names",
      "Keep functions small and focused: each function should do one thing, have few parameters, and operate at a single level of abstraction",
      "Eliminate duplication: extract shared logic into well-named helper functions; apply the DRY principle without over-abstracting",
      "Write self-documenting code: structure code so it reads like prose; use comments only for 'why' not 'what'",
      "Refactor continuously: apply the Boy Scout Rule — leave code cleaner than you found it with each commit"
    ],
    "steps_zh": [
      "意图命名：选择能揭示目的的变量、函数和类名；避免缩写和误导性名称",
      "保持函数小而专注：每个函数只做一件事，参数少，在单一抽象层级上操作",
      "消除重复：将共享逻辑提取到命名良好的辅助函数中；在不过度抽象的前提下应用DRY原则",
      "编写自文档化代码：组织代码使其如散文般可读；注释只用于解释「为什么」而非「做什么」",
      "持续重构：应用童子军规则——每次提交都让代码比发现时更整洁"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Naming",
      "Functions",
      "Comments",
      "Formatting",
      "Error Handling"
    ],
    "viz_labels_zh": [
      "命名",
      "函数",
      "注释",
      "格式",
      "错误处理"
    ],
    "related": [
      "solid-principles",
      "conventional-comments",
      "grasp-patterns"
    ],
    "tags": [
      "clean-code",
      "readability",
      "naming",
      "refactoring",
      "simplicity"
    ],
    "origin_author": "Robert C. Martin, 2008",
    "origin_source": "Clean Code: A Handbook of Agile Software Craftsmanship",
    "origin_source_zh": "《代码整洁之道》",
    "complexity": "beginner",
    "when_to_use": [
      "Writing any production code that will be read and maintained by others",
      "Onboarding new team members who need to understand existing code quickly",
      "Refactoring legacy code to reduce cognitive load and bug density",
      "Code reviews where readability and maintainability are primary concerns"
    ],
    "when_to_use_zh": [
      "编写任何将被他人阅读和维护的生产代码",
      "新团队成员入职时需要快速理解已有代码",
      "重构遗留代码以降低认知负荷和缺陷密度",
      "以可读性和可维护性为主要关注点的代码评审"
    ],
    "core_concepts": [
      "Meaningful Names: names should reveal intent, avoid disinformation, and be pronounceable and searchable",
      "Small Functions: functions should do one thing, do it well, and do it only — ideally under 20 lines",
      "DRY (Don't Repeat Yourself): every piece of knowledge should have a single, unambiguous representation in the system",
      "Boy Scout Rule: always leave the codebase cleaner than you found it through incremental improvements",
      "Comments as Last Resort: well-written code should be self-explanatory; comments should explain 'why', not 'what'"
    ],
    "core_concepts_zh": [
      "有意义的命名：名称应揭示意图、避免误导，且可发音、可搜索",
      "小函数：函数应只做一件事，做好它，只做这一件——理想情况下不超过 20 行",
      "DRY（不要重复自己）：系统中每条知识都应有唯一、明确的表示",
      "童子军规则：通过渐进式改进，始终让代码库比你发现时更整洁",
      "注释是最后手段：写得好的代码应该是自解释的；注释应解释「为什么」而非「做什么」"
    ],
    "timeline": [
      [
        "1999",
        "The Pragmatic Programmer by Hunt and Thomas introduces DRY and related code quality ideas"
      ],
      [
        "2008",
        "Robert C. Martin publishes 'Clean Code', codifying readability-first principles"
      ],
      [
        "2011",
        "Martin follows up with 'The Clean Coder', extending principles to professional conduct"
      ],
      [
        "2017",
        "Clean Code ideas merge with modern practices like pull request reviews and automated linting"
      ]
    ],
    "timeline_zh": [
      [
        "1999",
        "Hunt 和 Thomas 的《程序员修炼之道》引入 DRY 及相关代码质量理念"
      ],
      [
        "2008",
        "Robert C. Martin 出版《代码整洁之道》，将可读性优先的原则系统化"
      ],
      [
        "2011",
        "Martin 出版后续作品《程序员的职业素养》，将原则扩展到职业行为"
      ],
      [
        "2017",
        "整洁代码理念与拉取请求评审和自动化 lint 等现代实践融合"
      ]
    ],
    "dos": [
      "Do choose descriptive names even if they are longer because clarity beats brevity",
      "Do extract functions at different levels of abstraction because it makes the code read like a narrative",
      "Do write unit tests for every behavior because tests are the best documentation of intent",
      "Do refactor in small steps because large refactors introduce risk and are hard to review"
    ],
    "dos_zh": [
      "即使名称较长也要选择描述性命名，因为清晰胜于简短",
      "在不同抽象层级提取函数，因为这让代码读起来像叙事文",
      "为每个行为编写单元测试，因为测试是意图的最佳文档",
      "以小步骤进行重构，因为大规模重构引入风险且难以评审"
    ],
    "donts": [
      "Don't use magic numbers or strings because they hide intent and create maintenance landmines",
      "Don't write functions with more than three parameters because they are hard to understand and test",
      "Don't leave dead code commented out because version control preserves history",
      "Don't obsess over code aesthetics at the expense of shipping because perfect is the enemy of done"
    ],
    "donts_zh": [
      "不要使用魔法数字或字符串，因为它们隐藏意图并埋下维护隐患",
      "不要编写超过三个参数的函数，因为它们难以理解和测试",
      "不要保留被注释掉的死代码，因为版本控制已保存了历史",
      "不要为追求代码美观而牺牲交付速度，因为完美是完成的敌人"
    ],
    "case_study_company": "Basecamp",
    "case_study": "Basecamp (formerly 37signals) built their entire Ruby on Rails codebase around Clean Code principles. DHH and the team maintained strict naming conventions and small method sizes, which allowed a team of fewer than 20 programmers to build and maintain a product used by millions. Their emphasis on readable code over clever code became a core tenet of the Rails community.",
    "case_study_zh": "Basecamp（前身为 37signals）围绕整洁代码原则构建了整个 Ruby on Rails 代码库。DHH 和团队保持了严格的命名约定和小方法规模，使得不到 20 人的团队能够构建和维护数百万用户使用的产品。他们对可读代码而非聪明代码的强调成为 Rails 社区的核心信条。",
    "when_not_to_use": [
      "Competitive programming or algorithmic contests where execution speed is the only metric",
      "Quick throwaway scripts that will never be maintained or shared",
      "Performance-critical hot paths where micro-optimizations require less readable code",
      "Exploratory data analysis notebooks where iteration speed trumps code structure"
    ],
    "when_not_to_use_zh": [
      "竞技编程或算法竞赛，执行速度是唯一指标",
      "永远不会维护或共享的快速一次性脚本",
      "性能关键的热路径，微优化需要牺牲代码可读性",
      "探索性数据分析笔记本，迭代速度优先于代码结构"
    ],
    "adopters": [
      "Basecamp",
      "Pivotal Labs",
      "Thoughtworks",
      "Shopify",
      "GitHub"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Robert C. Martin (2008). \"Clean Code: A Handbook of Agile Software Craftsmanship\". Prentice Hall.",
    "secondary_sources": [
      "Martin Fowler (2018). \"Refactoring: Improving the Design of Existing Code, 2nd Edition\". Addison-Wesley.",
      "Robert C. Martin (2011). \"The Clean Coder: A Code of Conduct for Professional Programmers\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "conventional-comments",
        "type": "complement"
      },
      {
        "slug": "grasp-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 33,
    "name": "DDD Tactical Patterns",
    "name_zh": "DDD 战术模式",
    "slug": "ddd-tactical-patterns",
    "category": "coding",
    "desc": "Implementation building blocks: entities, value objects, aggregates",
    "desc_zh": "领域驱动设计的实现构建块：实体、值对象、聚合",
    "steps": [
      "Define Entities: identify objects with unique identity that persists across state changes; implement equality by identity, not attributes",
      "Design Value Objects: model concepts defined solely by their attributes (Money, Address); make them immutable and enforce equality by value",
      "Establish Aggregates: group related entities and value objects under an aggregate root that enforces consistency boundaries and invariants",
      "Implement Domain Events: capture meaningful state changes as event objects that can be published to other bounded contexts or projections",
      "Use Repositories and Factories: encapsulate persistence behind repository interfaces and complex object creation behind factory methods"
    ],
    "steps_zh": [
      "定义实体：识别具有跨状态变更持续存在的唯一标识的对象；按标识而非属性实现相等性",
      "设计值对象：对仅由属性定义的概念建模（金额、地址）；使其不可变，按值实现相等性",
      "建立聚合：将相关实体和值对象组织在聚合根下，由聚合根强制一致性边界和不变式",
      "实现领域事件：将有意义的状态变更捕获为事件对象，可发布到其他限界上下文或投影",
      "使用仓储与工厂：将持久化封装在仓储接口后，将复杂对象创建封装在工厂方法后"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Entity",
      "Value Object",
      "Aggregate",
      "Repository",
      "Domain Event"
    ],
    "viz_labels_zh": [
      "实体",
      "值对象",
      "聚合",
      "仓储",
      "领域事件"
    ],
    "related": [
      "domain-driven-design",
      "event-sourcing-pattern",
      "cqrs-pattern",
      "hexagonal-architecture"
    ],
    "tags": [
      "ddd",
      "entities",
      "value-objects",
      "aggregates",
      "domain-events"
    ],
    "origin_author": "Eric Evans, 2003",
    "origin_source": "Domain-Driven Design: Tackling Complexity in the Heart of Software",
    "origin_source_zh": "《领域驱动设计：软件核心复杂性应对之道》",
    "complexity": "advanced",
    "when_to_use": [
      "Building complex business domains with intricate rules and workflows",
      "Systems where the domain model is the primary source of competitive advantage",
      "Projects with domain experts actively collaborating with developers",
      "Microservice boundaries need to enforce strong consistency within aggregates"
    ],
    "when_to_use_zh": [
      "构建具有复杂规则和工作流的业务领域",
      "领域模型是核心竞争优势来源的系统",
      "领域专家积极与开发者协作的项目",
      "微服务边界需要在聚合内强制严格一致性"
    ],
    "core_concepts": [
      "Entity: an object defined by its identity rather than its attributes, whose lifecycle is tracked across state changes",
      "Value Object: an immutable object defined entirely by its attributes, with no conceptual identity (e.g., Money, DateRange)",
      "Aggregate: a cluster of entities and value objects treated as a single unit for data changes, with an Aggregate Root as the entry point",
      "Domain Event: a record of something meaningful that happened in the domain, used to communicate between bounded contexts",
      "Repository: an abstraction that mediates between the domain and data mapping layers, providing collection-like access to aggregates"
    ],
    "core_concepts_zh": [
      "实体：由其标识而非属性定义的对象，其生命周期跨越状态变更被持续追踪",
      "值对象：完全由属性定义的不可变对象，没有概念上的标识（如金额、日期范围）",
      "聚合：作为数据变更单元的实体和值对象集群，以聚合根作为入口点",
      "领域事件：领域中发生的有意义事情的记录，用于限界上下文之间的通信",
      "仓储：在领域层和数据映射层之间的抽象中介，提供类集合式的聚合访问"
    ],
    "timeline": [
      [
        "2003",
        "Eric Evans publishes 'Domain-Driven Design', introducing tactical and strategic patterns"
      ],
      [
        "2013",
        "Vaughn Vernon publishes 'Implementing Domain-Driven Design', providing practical implementation guidance"
      ],
      [
        "2016",
        "DDD tactical patterns gain traction in microservice architectures as aggregates map naturally onto services"
      ],
      [
        "2019",
        "Event-sourced aggregates become mainstream with tools like EventStoreDB and Axon Framework"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Eric Evans 出版《领域驱动设计》，引入战术和战略模式"
      ],
      [
        "2013",
        "Vaughn Vernon 出版《实现领域驱动设计》，提供实践性实施指导"
      ],
      [
        "2016",
        "DDD 战术模式在微服务架构中获得广泛应用，聚合自然映射到服务"
      ],
      [
        "2019",
        "事件溯源聚合随着 EventStoreDB 和 Axon Framework 等工具成为主流"
      ]
    ],
    "dos": [
      "Do keep aggregates small because large aggregates create contention and performance problems",
      "Do make value objects immutable because mutability introduces subtle bugs in equality and hashing",
      "Do use ubiquitous language in code because domain experts should be able to read your class names",
      "Do enforce invariants at the aggregate root because consistency boundaries must be explicit"
    ],
    "dos_zh": [
      "保持聚合小巧，因为大聚合会造成竞争和性能问题",
      "使值对象不可变，因为可变性会在相等性和哈希中引入微妙的缺陷",
      "在代码中使用通用语言，因为领域专家应能读懂你的类名",
      "在聚合根处强制不变式，因为一致性边界必须是显式的"
    ],
    "donts": [
      "Don't reference other aggregates by direct object reference because it breaks consistency boundaries — use IDs instead",
      "Don't create anemic domain models that are just data bags with external service logic because it defeats the purpose of DDD",
      "Don't skip event modeling because domain events are the primary mechanism for cross-aggregate communication",
      "Don't apply DDD tactical patterns to simple CRUD domains because the overhead is not justified"
    ],
    "donts_zh": [
      "不要通过直接对象引用关联其他聚合，因为这破坏了一致性边界——应使用 ID",
      "不要创建仅是数据袋配外部服务逻辑的贫血领域模型，因为这违背了 DDD 的初衷",
      "不要跳过事件建模，因为领域事件是跨聚合通信的主要机制",
      "不要对简单的 CRUD 领域应用 DDD 战术模式，因为其开销不值得"
    ],
    "case_study_company": "Zalando",
    "case_study": "Zalando, Europe's largest online fashion platform, restructured its order management system using DDD tactical patterns. Each bounded context (inventory, payment, shipping) was modeled with distinct aggregates, and domain events enabled eventual consistency across contexts. This allowed Zalando to scale to processing over 500,000 orders per day with independent team deployments.",
    "case_study_zh": "欧洲最大的在线时尚平台 Zalando 使用 DDD 战术模式重构了其订单管理系统。每个限界上下文（库存、支付、物流）都用独立的聚合建模，领域事件实现了跨上下文的最终一致性。这使 Zalando 能够扩展到每天处理超过 50 万笔订单，同时各团队独立部署。",
    "when_not_to_use": [
      "Simple CRUD applications with minimal business logic",
      "Projects without access to domain experts for collaborative modeling",
      "Teams unfamiliar with DDD who would misapply patterns and create accidental complexity",
      "Read-heavy reporting systems where a query-optimized model is more appropriate"
    ],
    "when_not_to_use_zh": [
      "业务逻辑极少的简单 CRUD 应用",
      "无法接触领域专家进行协作建模的项目",
      "不熟悉 DDD 的团队，会误用模式并造成意外复杂性",
      "以读为主的报表系统，查询优化模型更合适"
    ],
    "adopters": [
      "Zalando",
      "SoundCloud",
      "Klarna",
      "Just Eat Takeaway",
      "Vrbo"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Eric Evans (2003). \"Domain-Driven Design: Tackling Complexity in the Heart of Software\". Addison-Wesley.",
    "secondary_sources": [
      "Vaughn Vernon (2013). \"Implementing Domain-Driven Design\". Addison-Wesley.",
      "Scott Millett and Nick Tune (2015). \"Patterns, Principles, and Practices of Domain-Driven Design\". Wrox."
    ],
    "typed_relations": [
      {
        "slug": "domain-driven-design",
        "type": "extends"
      },
      {
        "slug": "event-sourcing-pattern",
        "type": "complement"
      },
      {
        "slug": "cqrs-pattern",
        "type": "complement"
      },
      {
        "slug": "hexagonal-architecture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 34,
    "name": "Hexagonal Architecture",
    "name_zh": "六边形架构",
    "slug": "hexagonal-architecture",
    "category": "coding",
    "desc": "Isolate core logic from external concerns via ports and adapters",
    "desc_zh": "通过端口与适配器将核心逻辑与外部关注点隔离",
    "steps": [
      "Define the Domain Core: implement pure business logic with no dependencies on frameworks, databases, or external services",
      "Design Ports: create inbound ports (use case interfaces the outside world calls) and outbound ports (interfaces the domain needs from the outside)",
      "Implement Adapters: build concrete adapters for each port — REST controllers for inbound, repository implementations for outbound",
      "Wire at the Composition Root: connect adapters to ports via dependency injection at application startup, keeping the domain ignorant of infrastructure",
      "Test at each layer: unit-test the domain through ports with in-memory adapters; integration-test adapters against real infrastructure"
    ],
    "steps_zh": [
      "定义领域核心：实现不依赖框架、数据库或外部服务的纯业务逻辑",
      "设计端口：创建入站端口（外部世界调用的用例接口）和出站端口（领域需要的外部接口）",
      "实现适配器：为每个端口构建具体适配器——入站用REST控制器，出站用仓储实现",
      "在组合根装配：在应用启动时通过依赖注入连接适配器与端口，保持领域对基础设施无感知",
      "分层测试：通过端口使用内存适配器对领域进行单元测试；对适配器进行集成测试"
    ],
    "ai_relevant": true,
    "viz_type": "venn",
    "viz_labels": [
      "Application Core",
      "Driving Port",
      "Driven Port"
    ],
    "viz_labels_zh": [
      "应用核心",
      "主动端口",
      "被动端口"
    ],
    "related": [
      "solid-principles",
      "dependency-injection",
      "ddd-tactical-patterns",
      "clean-code-principles",
      "clean-architecture",
      "onion-architecture",
      "n-tier-layered",
      "ports-and-adapters"
    ],
    "tags": [
      "hexagonal",
      "ports-adapters",
      "clean-architecture",
      "isolation",
      "testability"
    ],
    "origin_author": "Alistair Cockburn, 2005",
    "origin_source": "Hexagonal Architecture (Ports and Adapters)",
    "origin_source_zh": "《六边形架构（端口与适配器）》",
    "complexity": "advanced",
    "when_to_use": [
      "Building applications that need to support multiple delivery mechanisms (REST, CLI, gRPC, messaging)",
      "Systems where infrastructure decisions (database, cloud provider) may change over time",
      "Projects requiring high testability with fast unit tests that don't need infrastructure",
      "Teams adopting DDD where domain isolation from infrastructure is a priority"
    ],
    "when_to_use_zh": [
      "构建需要支持多种交付机制（REST、CLI、gRPC、消息队列）的应用",
      "基础设施决策（数据库、云提供商）可能随时间变化的系统",
      "要求高可测试性、快速单元测试不依赖基础设施的项目",
      "采用 DDD 并将领域与基础设施的隔离视为优先事项的团队"
    ],
    "core_concepts": [
      "Domain Core: the innermost layer containing pure business logic with zero external dependencies",
      "Ports: interfaces that define how the outside world interacts with the domain (inbound) and how the domain reaches external systems (outbound)",
      "Adapters: concrete implementations that plug into ports, translating between external technology and domain contracts",
      "Composition Root: the single place (typically application startup) where all ports are wired to their adapters via dependency injection",
      "Dependency Rule: dependencies always point inward — adapters depend on ports, never the reverse"
    ],
    "core_concepts_zh": [
      "领域核心：最内层，包含零外部依赖的纯业务逻辑",
      "端口：定义外部世界如何与领域交互（入站）以及领域如何访问外部系统（出站）的接口",
      "适配器：插入端口的具体实现，在外部技术与领域契约之间进行翻译",
      "组合根：所有端口通过依赖注入连接到适配器的唯一位置（通常是应用启动处）",
      "依赖规则：依赖方向始终指向内部——适配器依赖端口，绝不反过来"
    ],
    "timeline": [
      [
        "2005",
        "Alistair Cockburn publishes the Hexagonal Architecture pattern on his wiki"
      ],
      [
        "2008",
        "Jeffrey Palermo introduces Onion Architecture, a close variant with explicit layer rings"
      ],
      [
        "2012",
        "Robert C. Martin presents Clean Architecture, synthesizing hexagonal, onion, and DCI ideas"
      ],
      [
        "2017",
        "Netflix and Spotify publicly share their hexagonal architecture implementations for microservices"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "Alistair Cockburn 在其 wiki 上发布六边形架构模式"
      ],
      [
        "2008",
        "Jeffrey Palermo 提出洋葱架构，一种具有显式分层环的近似变体"
      ],
      [
        "2012",
        "Robert C. Martin 提出整洁架构，综合了六边形、洋葱和 DCI 的思想"
      ],
      [
        "2017",
        "Netflix 和 Spotify 公开分享了他们微服务的六边形架构实现"
      ]
    ],
    "dos": [
      "Do define ports as interfaces in the domain layer because they represent domain needs, not infrastructure capabilities",
      "Do keep the domain core free of framework annotations because annotations create hidden dependencies",
      "Do create separate adapter modules for each external system because it isolates change impact",
      "Do write domain tests using in-memory adapters because they run in milliseconds and catch logic bugs"
    ],
    "dos_zh": [
      "在领域层定义端口为接口，因为它们代表领域需求而非基础设施能力",
      "保持领域核心不含框架注解，因为注解会创建隐式依赖",
      "为每个外部系统创建独立的适配器模块，因为这隔离了变更影响",
      "使用内存适配器编写领域测试，因为它们在毫秒内运行并能捕获逻辑缺陷"
    ],
    "donts": [
      "Don't let domain objects import infrastructure packages because it violates the dependency rule",
      "Don't create one mega-adapter that handles multiple external systems because it defeats the purpose of isolation",
      "Don't skip the port abstraction by having domain code call adapters directly because it reintroduces coupling",
      "Don't over-architect simple CRUD endpoints with hexagonal layers because the indirection adds no value"
    ],
    "donts_zh": [
      "不要让领域对象导入基础设施包，因为这违反了依赖规则",
      "不要创建处理多个外部系统的超级适配器，因为这违背了隔离的初衷",
      "不要跳过端口抽象让领域代码直接调用适配器，因为这重新引入了耦合",
      "不要对简单的 CRUD 端点过度应用六边形分层，因为间接层不带来价值"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix adopted hexagonal architecture for its content encoding pipeline. The domain core handles transcoding decisions and quality rules, while adapters connect to different cloud storage backends (S3, Google Cloud Storage) and encoding engines. When Netflix migrated parts of its infrastructure, only adapters changed — the core encoding logic remained untouched, saving months of re-testing.",
    "case_study_zh": "Netflix 为其内容编码管道采用了六边形架构。领域核心处理转码决策和质量规则，而适配器连接不同的云存储后端（S3、Google Cloud Storage）和编码引擎。当 Netflix 迁移部分基础设施时，只有适配器发生变化——核心编码逻辑保持不变，节省了数月的重新测试时间。",
    "when_not_to_use": [
      "Trivial applications with a single database and one delivery mechanism",
      "Serverless functions where each function is a single-purpose handler with minimal logic",
      "Teams that lack experience with dependency injection and interface-based design",
      "Performance-critical systems where the port/adapter indirection introduces unacceptable latency"
    ],
    "when_not_to_use_zh": [
      "只有单个数据库和一种交付机制的简单应用",
      "每个函数都是单用途处理程序且逻辑极少的无服务器函数",
      "缺乏依赖注入和基于接口设计经验的团队",
      "端口/适配器间接层引入不可接受延迟的性能关键系统"
    ],
    "adopters": [
      "Netflix",
      "Spotify",
      "Mercado Libre",
      "BBVA",
      "Thoughtworks"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "testability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Alistair Cockburn (2005). \"Hexagonal Architecture (Ports and Adapters)\". alistair.cockburn.us.",
    "secondary_sources": [
      "Robert C. Martin (2017). \"Clean Architecture: A Craftsman's Guide to Software Structure and Design\". Prentice Hall.",
      "Tom Hombergs (2019). \"Get Your Hands Dirty on Clean Architecture\". Packt Publishing."
    ],
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "prerequisite"
      },
      {
        "slug": "dependency-injection",
        "type": "complement"
      },
      {
        "slug": "ddd-tactical-patterns",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "clean-architecture",
        "type": "alternative"
      },
      {
        "slug": "onion-architecture",
        "type": "alternative"
      },
      {
        "slug": "n-tier-layered",
        "type": "alternative"
      },
      {
        "slug": "ports-and-adapters",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 35,
    "name": "Functional Core / Imperative Shell",
    "name_zh": "函数式核心 / 命令式外壳",
    "slug": "functional-core-imperative-shell",
    "category": "coding",
    "desc": "Pure logic in the center, side effects only at the boundaries",
    "desc_zh": "纯逻辑居于核心，副作用仅存在于边界层",
    "steps": [
      "Identify side effects: catalog all I/O, database calls, network requests, and mutations in the current codebase",
      "Extract pure functions: move all decision-making and transformation logic into pure functions with no side effects",
      "Design the imperative shell: create a thin outer layer that reads input, calls the pure core, and writes output",
      "Pass data, not dependencies: the pure core receives plain data and returns plain data — no injected services or callbacks",
      "Test the core exhaustively: unit-test pure functions with simple input/output assertions; integration-test only the shell"
    ],
    "steps_zh": [
      "识别副作用：编目当前代码库中所有I/O、数据库调用、网络请求和状态变更",
      "提取纯函数：将所有决策和转换逻辑移入无副作用的纯函数中",
      "设计命令式外壳：创建薄外层，负责读取输入、调用纯核心并写出结果",
      "传递数据而非依赖：纯核心接收普通数据并返回普通数据——不注入服务或回调",
      "穷尽测试核心：用简单的输入输出断言对纯函数做单元测试；仅对外壳做集成测试"
    ],
    "ai_relevant": false,
    "viz_type": "venn",
    "viz_labels": [
      "Functional Core",
      "Imperative Shell",
      "Pure Logic"
    ],
    "viz_labels_zh": [
      "函数式核心",
      "命令式外壳",
      "纯函数逻辑"
    ],
    "related": [
      "hexagonal-architecture",
      "clean-code-principles",
      "property-based-testing"
    ],
    "tags": [
      "functional-programming",
      "pure-functions",
      "side-effects",
      "testability",
      "architecture"
    ],
    "origin_author": "Gary Bernhardt, 2012",
    "origin_source": "Boundaries (talk at RubyConf and Destroy All Software screencasts)",
    "origin_source_zh": "《Boundaries》（RubyConf 演讲及 Destroy All Software 系列视频）",
    "complexity": "intermediate",
    "when_to_use": [
      "Applications with complex business logic that needs thorough testing without infrastructure dependencies",
      "Codebases with tangled side effects that make unit testing slow and flaky",
      "Multi-language systems where the pure core can be shared or ported across platforms",
      "Data pipelines where transformation logic should be independent of I/O mechanisms"
    ],
    "when_to_use_zh": [
      "业务逻辑复杂、需要彻底测试而不依赖基础设施的应用",
      "副作用交织导致单元测试缓慢且不稳定的代码库",
      "纯核心可在平台间共享或移植的多语言系统",
      "转换逻辑应独立于 I/O 机制的数据管道"
    ],
    "core_concepts": [
      "Pure Functions: functions that always return the same output for the same input and produce no side effects",
      "Imperative Shell: the thin boundary layer that performs all I/O, state mutations, and external communications",
      "Data In, Data Out: the pure core communicates through plain data structures, not interfaces or service objects",
      "Testability by Design: pure functions can be tested with simple assertions; no mocks, stubs, or test doubles needed",
      "Push Effects to the Edges: move all side-effectful code as far outward as possible so the core remains pure"
    ],
    "core_concepts_zh": [
      "纯函数：对相同输入始终返回相同输出、不产生副作用的函数",
      "命令式外壳：执行所有 I/O、状态变更和外部通信的薄边界层",
      "数据进，数据出：纯核心通过普通数据结构通信，而非接口或服务对象",
      "设计即可测试：纯函数可用简单断言测试；不需要 mock、stub 或测试替身",
      "将副作用推向边缘：将所有副作用代码尽可能推向外层，保持核心纯净"
    ],
    "timeline": [
      [
        "2012",
        "Gary Bernhardt presents 'Boundaries' at RubyConf, articulating the Functional Core / Imperative Shell pattern"
      ],
      [
        "2013",
        "The pattern gains traction in the Ruby and Elixir communities through Destroy All Software screencasts"
      ],
      [
        "2016",
        "Redux architecture in React popularizes a similar pattern: pure reducers (core) with side-effect middleware (shell)"
      ],
      [
        "2020",
        "The pattern becomes standard in serverless architectures where pure business logic is separated from handler wiring"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Gary Bernhardt 在 RubyConf 发表《Boundaries》演讲，阐述函数式核心/命令式外壳模式"
      ],
      [
        "2013",
        "该模式通过 Destroy All Software 系列视频在 Ruby 和 Elixir 社区中获得关注"
      ],
      [
        "2016",
        "React 中的 Redux 架构推广了类似模式：纯 reducer（核心）配合副作用中间件（外壳）"
      ],
      [
        "2020",
        "该模式成为无服务器架构的标准做法，纯业务逻辑与处理程序装配分离"
      ]
    ],
    "dos": [
      "Do pass all needed data as function arguments because implicit state access breaks purity",
      "Do return new data structures from pure functions instead of mutating inputs because immutability prevents bugs",
      "Do keep the shell as thin as possible because the less imperative code you have, the fewer integration tests you need",
      "Do use value objects to pass structured data between shell and core because they enforce type safety"
    ],
    "dos_zh": [
      "将所有所需数据作为函数参数传递，因为隐式状态访问破坏纯净性",
      "从纯函数返回新数据结构而非修改输入，因为不可变性防止缺陷",
      "保持外壳尽可能薄，因为命令式代码越少，需要的集成测试就越少",
      "使用值对象在外壳和核心之间传递结构化数据，因为它们强制类型安全"
    ],
    "donts": [
      "Don't inject services or repositories into the pure core because it reintroduces side-effect dependencies",
      "Don't perform logging or metrics collection inside pure functions because those are side effects",
      "Don't let the shell grow to contain business logic because it should only orchestrate I/O and delegation",
      "Don't test pure functions with mocks because they should be testable with plain input/output assertions"
    ],
    "donts_zh": [
      "不要向纯核心注入服务或仓储，因为这重新引入了副作用依赖",
      "不要在纯函数内执行日志记录或指标收集，因为这些是副作用",
      "不要让外壳膨胀到包含业务逻辑，因为它应仅编排 I/O 和委托",
      "不要用 mock 测试纯函数，因为它们应该可以用简单的输入/输出断言来测试"
    ],
    "case_study_company": "Shopify",
    "case_study": "Shopify's checkout system uses the Functional Core / Imperative Shell pattern in its Ruby codebase. All pricing, discount, and tax calculation logic lives in pure functions that receive cart data and return pricing results. The imperative shell handles payment gateway calls and database persistence. This separation enabled Shopify to achieve sub-100ms test suites for pricing logic and confidently handle Black Friday traffic spikes.",
    "case_study_zh": "Shopify 的结账系统在其 Ruby 代码库中使用了函数式核心/命令式外壳模式。所有定价、折扣和税费计算逻辑都在纯函数中，接收购物车数据并返回定价结果。命令式外壳处理支付网关调用和数据库持久化。这种分离使 Shopify 的定价逻辑测试套件达到了低于 100 毫秒的执行速度，并能自信地应对黑色星期五的流量高峰。",
    "when_not_to_use": [
      "Highly interactive UI applications where state management is inherently stateful",
      "Systems dominated by I/O with very little business logic (e.g., proxy servers)",
      "Legacy codebases where extracting pure functions would require a massive rewrite",
      "Real-time systems where the overhead of data copying affects latency requirements"
    ],
    "when_not_to_use_zh": [
      "高度交互式的 UI 应用，状态管理本质上是有状态的",
      "以 I/O 为主、业务逻辑极少的系统（如代理服务器）",
      "提取纯函数需要大规模重写的遗留代码库",
      "数据拷贝开销影响延迟要求的实时系统"
    ],
    "adopters": [
      "Shopify",
      "Stripe",
      "GitHub",
      "Jane Street",
      "Nubank"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "testability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Gary Bernhardt (2012). \"Boundaries\". Destroy All Software Screencasts / RubyConf Talk.",
    "secondary_sources": [
      "Scott Wlaschin (2018). \"Domain Modeling Made Functional\". Pragmatic Bookshelf.",
      "Mark Seemann (2021). \"Code That Fits in Your Head: Heuristics for Software Engineering\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "hexagonal-architecture",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "property-based-testing",
        "type": "complement"
      }
    ]
  },
  {
    "id": 36,
    "name": "Reactive Extensions (Rx)",
    "name_zh": "响应式扩展（Rx）",
    "slug": "reactive-extensions",
    "category": "coding",
    "desc": "Compose async event streams with observable sequences and operators",
    "desc_zh": "通过可观察序列和操作符组合异步事件流",
    "steps": [
      "Model data as streams: represent events, user actions, and async results as Observable sequences rather than callbacks or promises",
      "Apply operators: use map, filter, merge, switchMap, debounce, and other operators to transform and combine streams declaratively",
      "Manage subscriptions: subscribe to observables at the UI or service boundary and unsubscribe on component destruction to prevent memory leaks",
      "Handle errors in the stream: use catchError, retry, and fallback operators to build resilient pipelines without try/catch nesting",
      "Test with marble diagrams: use virtual time schedulers and marble syntax to write deterministic tests for complex async flows"
    ],
    "steps_zh": [
      "将数据建模为流：将事件、用户动作和异步结果表示为Observable序列，而非回调或Promise",
      "应用操作符：使用map、filter、merge、switchMap、debounce等操作符声明式地转换和组合流",
      "管理订阅：在UI或服务边界订阅Observable，在组件销毁时取消订阅以防止内存泄漏",
      "在流中处理错误：使用catchError、retry和降级操作符构建弹性管道，避免try/catch嵌套",
      "用弹珠图测试：使用虚拟时间调度器和弹珠语法为复杂异步流编写确定性测试"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Observable",
      "Operator",
      "Subscriber",
      "Scheduler"
    ],
    "viz_labels_zh": [
      "可观察序列",
      "操作符",
      "订阅者",
      "调度器"
    ],
    "related": [
      "eda",
      "actor-model",
      "functional-core-imperative-shell"
    ],
    "tags": [
      "reactive",
      "observables",
      "streams",
      "async",
      "operators"
    ],
    "origin_author": "Erik Meijer, 2009",
    "origin_source": "Reactive Extensions for .NET (Rx.NET), Microsoft DevLabs",
    "origin_source_zh": "《.NET 响应式扩展（Rx.NET）》，微软开发实验室",
    "complexity": "advanced",
    "when_to_use": [
      "Building real-time UIs that react to multiple asynchronous data sources (WebSocket, user input, timers)",
      "Implementing complex event processing pipelines with backpressure and error recovery",
      "Mobile applications where lifecycle-aware subscription management is critical",
      "Systems that need to compose, merge, and throttle multiple event streams declaratively"
    ],
    "when_to_use_zh": [
      "构建响应多个异步数据源（WebSocket、用户输入、定时器）的实时 UI",
      "实现具有背压和错误恢复的复杂事件处理管道",
      "生命周期感知的订阅管理至关重要的移动应用",
      "需要声明式地组合、合并和限流多个事件流的系统"
    ],
    "core_concepts": [
      "Observable: a push-based collection that emits items over time, representing an asynchronous data stream",
      "Observer/Subscriber: a consumer that reacts to items emitted by an Observable via onNext, onError, and onComplete callbacks",
      "Operators: composable functions (map, filter, flatMap, debounce, zip) that transform and combine Observable streams",
      "Schedulers: control which thread or execution context an Observable emits on and an Observer subscribes on",
      "Backpressure: a mechanism for handling situations where an Observable produces items faster than an Observer can consume them"
    ],
    "core_concepts_zh": [
      "Observable：基于推送的集合，随时间发射元素，表示异步数据流",
      "Observer/Subscriber：通过 onNext、onError 和 onComplete 回调响应 Observable 发射元素的消费者",
      "操作符：可组合的函数（map、filter、flatMap、debounce、zip），用于转换和组合 Observable 流",
      "调度器：控制 Observable 在哪个线程或执行上下文上发射以及 Observer 在哪里订阅",
      "背压：处理 Observable 产生元素速度快于 Observer 消费速度的机制"
    ],
    "timeline": [
      [
        "2009",
        "Erik Meijer and team release Rx.NET at Microsoft, introducing the Observable/Observer pattern for async streams"
      ],
      [
        "2013",
        "RxJava is released by Netflix for JVM-based reactive programming in their microservice architecture"
      ],
      [
        "2015",
        "The Reactive Streams specification is published, standardizing backpressure across libraries"
      ],
      [
        "2018",
        "RxJS becomes integral to the Angular framework, bringing Rx to mainstream frontend development"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "Erik Meijer 团队在微软发布 Rx.NET，引入用于异步流的 Observable/Observer 模式"
      ],
      [
        "2013",
        "Netflix 发布 RxJava，用于其微服务架构中的 JVM 响应式编程"
      ],
      [
        "2015",
        "Reactive Streams 规范发布，在各库间标准化背压处理"
      ],
      [
        "2018",
        "RxJS 成为 Angular 框架的核心组件，将 Rx 带入主流前端开发"
      ]
    ],
    "dos": [
      "Do use marble diagrams for documentation and testing because they visually clarify complex timing behavior",
      "Do unsubscribe or use takeUntil patterns because leaked subscriptions are the most common Rx bug",
      "Do choose the right operator for the job because using flatMap when you need switchMap causes subtle bugs",
      "Do handle errors at the stream level because an unhandled error terminates the entire Observable chain"
    ],
    "dos_zh": [
      "使用弹珠图进行文档编写和测试，因为它们能直观地说明复杂的时序行为",
      "取消订阅或使用 takeUntil 模式，因为泄漏的订阅是最常见的 Rx 缺陷",
      "为任务选择正确的操作符，因为在需要 switchMap 时使用 flatMap 会导致微妙的缺陷",
      "在流级别处理错误，因为未处理的错误会终止整个 Observable 链"
    ],
    "donts": [
      "Don't nest subscribe calls because it defeats the purpose of declarative stream composition",
      "Don't use Rx for simple one-shot async operations because Promises/async-await are simpler and sufficient",
      "Don't ignore backpressure in high-throughput scenarios because uncontrolled buffering causes out-of-memory errors",
      "Don't subscribe to a cold observable multiple times without understanding multicasting because each subscription triggers a new execution"
    ],
    "donts_zh": [
      "不要嵌套 subscribe 调用，因为这违背了声明式流组合的初衷",
      "不要对简单的一次性异步操作使用 Rx，因为 Promise/async-await 更简单且足够",
      "不要在高吞吐场景中忽略背压，因为不受控的缓冲会导致内存溢出",
      "不要在不理解多播的情况下多次订阅冷 Observable，因为每次订阅都会触发新的执行"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix created RxJava to manage the complexity of composing dozens of asynchronous microservice calls in their API layer. By modeling each service call as an Observable and using operators like zip and merge, they could assemble personalized page responses from parallel backend calls. This reactive approach reduced API response latency by 30% and simplified error handling across hundreds of microservices.",
    "case_study_zh": "Netflix 创建了 RxJava 来管理其 API 层中组合数十个异步微服务调用的复杂性。通过将每个服务调用建模为 Observable 并使用 zip 和 merge 等操作符，他们能从并行的后端调用中组装个性化页面响应。这种响应式方法将 API 响应延迟降低了 30%，并简化了数百个微服务的错误处理。",
    "when_not_to_use": [
      "Simple request-response APIs where Promises or async/await are sufficient",
      "Teams unfamiliar with functional reactive programming who would struggle with the learning curve",
      "Applications with very few asynchronous operations where Rx adds unnecessary complexity",
      "Batch processing systems where pull-based iteration is more appropriate than push-based streams"
    ],
    "when_not_to_use_zh": [
      "Promise 或 async/await 已足够的简单请求-响应 API",
      "不熟悉函数式响应式编程、难以应对学习曲线的团队",
      "异步操作极少的应用，Rx 会增加不必要的复杂性",
      "基于拉取的迭代比基于推送的流更合适的批处理系统"
    ],
    "adopters": [
      "Netflix",
      "Microsoft",
      "Google (Angular)",
      "Trello",
      "SoundCloud"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "performance",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Erik Meijer (2010). \"Reactive Extensions (Rx): Curing Your Asynchronous Programming Blues\". Microsoft DevLabs.",
    "secondary_sources": [
      "Jonas Boner et al. (2014). \"The Reactive Manifesto\". reactivemanifesto.org.",
      "Tomas Petricek (2010). \"Real-World Functional Programming\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "eda",
        "type": "complement"
      },
      {
        "slug": "actor-model",
        "type": "alternative"
      },
      {
        "slug": "functional-core-imperative-shell",
        "type": "complement"
      }
    ]
  },
  {
    "id": 37,
    "name": "Richardson Maturity Model",
    "name_zh": "Richardson 成熟度模型",
    "slug": "richardson-maturity-model",
    "category": "coding",
    "desc": "Four levels of REST API maturity from RPC to hypermedia",
    "desc_zh": "REST API 从 RPC 到超媒体的四级成熟度模型",
    "steps": [
      "Assess current level: determine if the API is at Level 0 (single URI, single verb), Level 1 (resources), Level 2 (HTTP verbs), or Level 3 (hypermedia)",
      "Introduce proper resources (Level 1): model each domain concept as a unique URI (e.g., /orders/123) instead of action-based endpoints",
      "Use HTTP verbs correctly (Level 2): map CRUD operations to GET, POST, PUT, DELETE with proper status codes and idempotency semantics",
      "Add hypermedia controls (Level 3): include links in responses (HATEOAS) so clients discover available actions dynamically",
      "Validate with consumer tests: use contract tests to ensure API responses match the expected level of maturity and remain backward-compatible"
    ],
    "steps_zh": [
      "评估当前级别：判断API处于0级（单URI单动词）、1级（资源）、2级（HTTP动词）还是3级（超媒体）",
      "引入合理资源（1级）：将每个领域概念建模为唯一URI（如/orders/123），而非基于动作的端点",
      "正确使用HTTP动词（2级）：将CRUD操作映射到GET、POST、PUT、DELETE，配合正确的状态码和幂等语义",
      "添加超媒体控制（3级）：在响应中包含链接（HATEOAS），使客户端动态发现可用操作",
      "通过消费者测试验证：使用契约测试确保API响应符合预期的成熟度级别并保持向后兼容"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Level 0",
      "Level 1 (Resources)",
      "Level 2 (HTTP Verbs)",
      "Level 3 (HATEOAS)"
    ],
    "viz_labels_zh": [
      "L0单端点",
      "L1资源",
      "L2 HTTP动词",
      "L3超媒体"
    ],
    "related": [
      "api-versioning-strategies",
      "contract-testing",
      "ai-first-api-design"
    ],
    "tags": [
      "rest",
      "api-design",
      "maturity-model",
      "hateoas",
      "http"
    ],
    "origin_author": "Leonard Richardson, 2008",
    "origin_source": "Justice Will Take Us Millions of Intricate Moves (QCon talk, later popularized by Martin Fowler)",
    "origin_source_zh": "QCon 演讲，后由 Martin Fowler 撰文推广",
    "complexity": "intermediate",
    "when_to_use": [
      "Evaluating the design quality of existing REST APIs during architecture reviews",
      "Planning API modernization roadmaps from RPC-style to RESTful design",
      "Teaching teams about REST best practices with a clear progression model",
      "Deciding the appropriate level of REST maturity for a new API based on client needs"
    ],
    "when_to_use_zh": [
      "在架构评审中评估现有 REST API 的设计质量",
      "规划从 RPC 风格到 RESTful 设计的 API 现代化路线图",
      "用清晰的渐进模型向团队传授 REST 最佳实践",
      "根据客户端需求决定新 API 的合适 REST 成熟度级别"
    ],
    "core_concepts": [
      "Level 0 - The Swamp of POX: one URI, one HTTP method (usually POST), essentially RPC over HTTP",
      "Level 1 - Resources: individual URIs for each resource, but still using a single HTTP method for all operations",
      "Level 2 - HTTP Verbs: proper use of GET, POST, PUT, DELETE with correct status codes and idempotency",
      "Level 3 - Hypermedia Controls (HATEOAS): responses include links that guide clients to available next actions",
      "Progressive Enhancement: each level builds on the previous, and most practical APIs target Level 2"
    ],
    "core_concepts_zh": [
      "第 0 级 - POX 沼泽：一个 URI、一个 HTTP 方法（通常是 POST），本质上是基于 HTTP 的 RPC",
      "第 1 级 - 资源：每个资源有独立的 URI，但仍对所有操作使用单一 HTTP 方法",
      "第 2 级 - HTTP 动词：正确使用 GET、POST、PUT、DELETE，配合正确的状态码和幂等性",
      "第 3 级 - 超媒体控制（HATEOAS）：响应中包含链接，引导客户端发现可用的后续操作",
      "渐进增强：每一级在前一级基础上构建，大多数实用 API 以第 2 级为目标"
    ],
    "timeline": [
      [
        "2000",
        "Roy Fielding defines REST in his doctoral dissertation at UC Irvine"
      ],
      [
        "2008",
        "Leonard Richardson presents the maturity model at QCon San Francisco"
      ],
      [
        "2010",
        "Martin Fowler publishes his influential blog post explaining the Richardson Maturity Model"
      ],
      [
        "2015",
        "Level 2 REST becomes the de facto standard for public APIs; Level 3 HATEOAS remains niche"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "Roy Fielding 在加州大学尔湾分校的博士论文中定义 REST"
      ],
      [
        "2008",
        "Leonard Richardson 在 QCon 旧金山大会上提出成熟度模型"
      ],
      [
        "2010",
        "Martin Fowler 发表有影响力的博客文章阐述 Richardson 成熟度模型"
      ],
      [
        "2015",
        "第 2 级 REST 成为公共 API 的事实标准；第 3 级 HATEOAS 仍属小众"
      ]
    ],
    "dos": [
      "Do use proper HTTP status codes because they convey semantics that clients and proxies understand",
      "Do make GET requests safe and idempotent because caching and retries depend on this guarantee",
      "Do version your API alongside maturity improvements because breaking changes need controlled rollout",
      "Do document your API's maturity level because it sets clear expectations for consumers"
    ],
    "dos_zh": [
      "使用正确的 HTTP 状态码，因为它们传达客户端和代理都理解的语义",
      "使 GET 请求安全且幂等，因为缓存和重试依赖于此保证",
      "在提升成熟度的同时对 API 进行版本控制，因为破坏性变更需要受控发布",
      "记录 API 的成熟度级别，因为这为消费者设定了明确的期望"
    ],
    "donts": [
      "Don't aim for Level 3 HATEOAS by default because most API consumers prefer explicit documentation over link discovery",
      "Don't use POST for everything, even though it works, because you lose caching, idempotency, and semantic clarity",
      "Don't confuse Richardson levels with API quality because a well-designed Level 2 API can be superior to a poorly implemented Level 3",
      "Don't ignore content negotiation because supporting JSON, XML, or versioned media types improves interoperability"
    ],
    "donts_zh": [
      "不要默认追求第 3 级 HATEOAS，因为大多数 API 消费者更偏好明确文档而非链接发现",
      "不要因为 POST 能工作就对所有操作使用 POST，因为你会失去缓存、幂等性和语义清晰性",
      "不要将 Richardson 级别与 API 质量混淆，因为设计良好的第 2 级 API 可以优于实现糟糕的第 3 级",
      "不要忽视内容协商，因为支持 JSON、XML 或版本化媒体类型能提升互操作性"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub's REST API is a well-known example of a Level 3 (HATEOAS) API. Every response includes hypermedia links that guide clients to related resources and available actions. This design allows the GitHub API to evolve without breaking existing integrations, as clients follow links rather than hardcoding URLs. The API serves billions of requests daily from thousands of third-party integrations.",
    "case_study_zh": "GitHub 的 REST API 是第 3 级（HATEOAS）API 的知名范例。每个响应都包含超媒体链接，引导客户端访问相关资源和可用操作。这种设计允许 GitHub API 在不破坏现有集成的情况下演进，因为客户端跟随链接而非硬编码 URL。该 API 每天服务来自数千个第三方集成的数十亿次请求。",
    "when_not_to_use": [
      "Internal microservice-to-microservice communication where gRPC or messaging is more efficient",
      "Real-time bidirectional communication where WebSocket or SSE is more appropriate",
      "High-performance binary protocols where HTTP overhead is unacceptable",
      "GraphQL-first architectures where the query language replaces REST resource modeling"
    ],
    "when_not_to_use_zh": [
      "内部微服务间通信，gRPC 或消息队列更高效",
      "需要实时双向通信的场景，WebSocket 或 SSE 更合适",
      "HTTP 开销不可接受的高性能二进制协议",
      "GraphQL 优先架构，查询语言取代了 REST 资源建模"
    ],
    "adopters": [
      "GitHub",
      "Stripe",
      "Twilio",
      "PayPal",
      "Atlassian"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Leonard Richardson (2008). \"Justice Will Take Us Millions of Intricate Moves\". QCon Conference Talk.",
    "secondary_sources": [
      "Martin Fowler (2010). \"Richardson Maturity Model\". martinfowler.com.",
      "Jim Webber, Savas Parastatidis, and Ian Robinson (2010). \"REST in Practice: Hypermedia and Systems Architecture\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "api-versioning-strategies",
        "type": "complement"
      },
      {
        "slug": "contract-testing",
        "type": "complement"
      },
      {
        "slug": "ai-first-api-design",
        "type": "related"
      }
    ]
  },
  {
    "id": 38,
    "name": "Event Sourcing Pattern",
    "name_zh": "事件溯源模式",
    "slug": "event-sourcing-pattern",
    "category": "coding",
    "desc": "Persist state as an immutable append-only sequence of events",
    "desc_zh": "将状态持久化为不可变的追加事件序列而非当前快照",
    "steps": [
      "Model domain events: define immutable event types that capture every meaningful state change (OrderPlaced, ItemShipped, PaymentReceived)",
      "Build the event store: implement an append-only log (EventStoreDB, Kafka, or custom) that stores events with sequence numbers and metadata",
      "Reconstruct state by replay: load an entity's current state by replaying its event stream from the beginning or from a snapshot",
      "Create projections: build read-optimized views by subscribing to the event stream and materializing data into query-friendly shapes",
      "Implement snapshotting: periodically save aggregate state snapshots to avoid replaying the full event history for long-lived entities"
    ],
    "steps_zh": [
      "建模领域事件：定义不可变的事件类型，捕获每个有意义的状态变更（订单创建、物品发货、收款完成）",
      "构建事件存储：实现追加式日志（EventStoreDB、Kafka或自定义），存储带序列号和元数据的事件",
      "通过回放重建状态：从头或从快照开始回放实体的事件流以加载其当前状态",
      "创建投影：通过订阅事件流并将数据物化为查询友好的形态来构建读优化视图",
      "实现快照：定期保存聚合状态快照，避免对长期存活的实体回放完整事件历史"
    ],
    "ai_relevant": false,
    "viz_type": "timeline",
    "viz_labels": [
      "Command",
      "Event",
      "Event Store",
      "Projection",
      "Snapshot"
    ],
    "viz_labels_zh": [
      "命令",
      "事件",
      "事件存储",
      "投影",
      "快照"
    ],
    "related": [
      "cqrs-pattern",
      "eda",
      "ddd-tactical-patterns",
      "domain-driven-design"
    ],
    "tags": [
      "event-sourcing",
      "immutable",
      "append-only",
      "replay",
      "audit-trail"
    ],
    "origin_author": "Greg Young, 2005",
    "origin_source": "CQRS and Event Sourcing (series of conference talks and papers)",
    "origin_source_zh": "《CQRS 与事件溯源》（系列会议演讲和论文）",
    "complexity": "advanced",
    "when_to_use": [
      "Systems requiring a complete audit trail of every state change for compliance or debugging",
      "Financial systems where transaction history must be immutable and reconstructable",
      "Domains with complex temporal queries (what was the state at time T?)",
      "Event-driven architectures where derived read models need to be rebuilt from source events"
    ],
    "when_to_use_zh": [
      "出于合规或调试需要完整审计跟踪每个状态变更的系统",
      "交易历史必须不可变且可重建的金融系统",
      "具有复杂时态查询（T 时刻的状态是什么？）的领域",
      "派生读模型需要从源事件重建的事件驱动架构"
    ],
    "core_concepts": [
      "Event Store: an append-only log that persists domain events in order, serving as the single source of truth",
      "Event Replay: reconstructing an entity's current state by sequentially applying all its past events",
      "Projections: read-optimized materialized views built by processing the event stream into query-friendly data structures",
      "Snapshotting: periodically persisting the current aggregate state to optimize replay performance for long event streams",
      "Temporal Queries: the ability to reconstruct the state of the system at any point in time by replaying events up to that moment"
    ],
    "core_concepts_zh": [
      "事件存储：按顺序持久化领域事件的追加式日志，作为唯一的真相来源",
      "事件回放：通过顺序应用实体的所有过去事件来重建其当前状态",
      "投影：通过将事件流处理为查询友好的数据结构来构建的读优化物化视图",
      "快照：定期持久化当前聚合状态，以优化长事件流的回放性能",
      "时态查询：通过回放到某一时刻的事件来重建系统在任意时间点的状态的能力"
    ],
    "timeline": [
      [
        "2005",
        "Greg Young begins advocating event sourcing alongside CQRS in DDD community talks"
      ],
      [
        "2010",
        "EventStore (now EventStoreDB) is released as a purpose-built event sourcing database"
      ],
      [
        "2014",
        "Martin Fowler publishes his event sourcing overview, bringing the pattern to mainstream attention"
      ],
      [
        "2018",
        "Apache Kafka becomes widely adopted as an event store backbone in microservice architectures"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "Greg Young 在 DDD 社区演讲中开始倡导事件溯源与 CQRS"
      ],
      [
        "2010",
        "EventStore（现 EventStoreDB）作为专门构建的事件溯源数据库发布"
      ],
      [
        "2014",
        "Martin Fowler 发表事件溯源概述文章，将该模式带入主流视野"
      ],
      [
        "2018",
        "Apache Kafka 被广泛采用为微服务架构中的事件存储骨干"
      ]
    ],
    "dos": [
      "Do design events as immutable facts about what happened because they are the source of truth",
      "Do version your event schemas because event structure will evolve and old events must remain readable",
      "Do implement snapshotting for aggregates with long histories because replay time grows linearly with the number of events",
      "Do separate write and read models because event-sourced writes are append-only and reads need materialized views"
    ],
    "dos_zh": [
      "将事件设计为关于已发生事实的不可变记录，因为它们是真相来源",
      "对事件 schema 进行版本控制，因为事件结构会演进而旧事件必须保持可读",
      "为历史记录较长的聚合实现快照，因为回放性能随事件数量线性下降",
      "分离写模型和读模型，因为事件溯源的写入是追加式的而读取需要物化视图"
    ],
    "donts": [
      "Don't delete or modify events because immutability is the foundational guarantee of event sourcing",
      "Don't use event sourcing for simple CRUD domains because the complexity overhead is not justified",
      "Don't forget to handle event schema evolution because breaking changes to events can corrupt the entire stream",
      "Don't build a single monolithic projection because different read concerns need different optimized views"
    ],
    "donts_zh": [
      "不要删除或修改事件，因为不可变性是事件溯源的基础保证",
      "不要对简单 CRUD 领域使用事件溯源，因为复杂性开销不值得",
      "不要忘记处理事件 schema 演进，因为事件的破坏性变更可能损坏整个流",
      "不要构建单个巨型投影，因为不同的读取关注点需要不同的优化视图"
    ],
    "case_study_company": "LMAX Exchange",
    "case_study": "LMAX Exchange, a high-frequency trading platform, built its entire trading engine on event sourcing. Every order, trade, and market movement is captured as an immutable event. The system can replay the full trading day in minutes for audit purposes, and the event-sourced architecture enables LMAX to process over 6 million transactions per second with consistent latency under 1 millisecond.",
    "case_study_zh": "高频交易平台 LMAX Exchange 将其整个交易引擎构建在事件溯源之上。每个订单、交易和市场变动都被捕获为不可变事件。系统可在数分钟内回放全天交易数据用于审计，事件溯源架构使 LMAX 能以低于 1 毫秒的一致延迟每秒处理超过 600 万笔交易。",
    "when_not_to_use": [
      "Simple CRUD applications where current state is the only concern",
      "Systems with very high write volumes where event storage costs become prohibitive",
      "Teams without experience in event-driven architecture who would struggle with eventual consistency",
      "Domains where regulatory requirements mandate data deletion (right to be forgotten), which is incompatible with immutable logs"
    ],
    "when_not_to_use_zh": [
      "只关心当前状态的简单 CRUD 应用",
      "写入量极高、事件存储成本变得难以承受的系统",
      "没有事件驱动架构经验、难以应对最终一致性的团队",
      "法规要求数据删除（被遗忘权）与不可变日志不兼容的领域"
    ],
    "adopters": [
      "LMAX Exchange",
      "Walmart",
      "Capital One",
      "Jet.com",
      "EventStore Ltd"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Greg Young (2010). \"CQRS and Event Sourcing\". cqrs.files.wordpress.com.",
    "secondary_sources": [
      "Martin Fowler (2005). \"Event Sourcing\". martinfowler.com.",
      "Vaughn Vernon (2013). \"Implementing Domain-Driven Design\". Addison-Wesley. Chapter 8."
    ],
    "typed_relations": [
      {
        "slug": "cqrs-pattern",
        "type": "complement"
      },
      {
        "slug": "eda",
        "type": "complement"
      },
      {
        "slug": "ddd-tactical-patterns",
        "type": "complement"
      },
      {
        "slug": "domain-driven-design",
        "type": "prerequisite"
      }
    ]
  },
  {
    "id": 39,
    "name": "Prompt Engineering Patterns",
    "name_zh": "提示工程模式",
    "slug": "prompt-engineering-patterns",
    "category": "coding",
    "desc": "Structured techniques for crafting effective LLM prompts",
    "desc_zh": "构建高效大模型提示词的结构化技术模式集合",
    "steps": [
      "Define the persona and context: set a clear system prompt that establishes the AI's role, expertise level, and behavioral constraints",
      "Use structured output formats: specify JSON schemas, markdown templates, or typed response formats to get parseable, consistent results",
      "Apply few-shot examples: include 2-5 representative input-output examples that demonstrate the desired quality, style, and edge cases",
      "Add chain-of-thought instructions: prompt the model to reason step-by-step before producing a final answer to improve accuracy on complex tasks",
      "Iterate and evaluate: A/B test prompt variants against a golden dataset; track quality metrics and version prompts like code"
    ],
    "steps_zh": [
      "定义角色与上下文：设定清晰的系统提示词，建立AI的角色、专业水平和行为约束",
      "使用结构化输出格式：指定JSON Schema、Markdown模板或类型化响应格式以获得可解析、一致的结果",
      "应用少样本示例：包含2-5个代表性输入输出示例，展示期望的质量、风格和边界情况",
      "添加思维链指令：提示模型在给出最终答案前逐步推理，以提高复杂任务的准确性",
      "迭代与评估：基于黄金数据集对提示词变体进行A/B测试；追踪质量指标，像管理代码一样管理提示词版本"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "System Prompt",
      "Context",
      "Instruction",
      "Output Format"
    ],
    "viz_labels_zh": [
      "系统提示",
      "上下文",
      "指令",
      "输出格式"
    ],
    "related": [
      "prompt-chaining",
      "tool-use-design-pattern",
      "llm-system-design-patterns"
    ],
    "tags": [
      "prompt-engineering",
      "few-shot",
      "chain-of-thought",
      "llm",
      "ai"
    ],
    "origin_author": "Jason Wei, Xuezhi Wang et al. (Google Brain), 2022",
    "origin_source": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models (NeurIPS 2022)",
    "origin_source_zh": "《思维链提示激发大语言模型推理能力》（NeurIPS 2022）",
    "complexity": "beginner",
    "when_to_use": [
      "Integrating LLMs into production applications that require consistent, high-quality outputs",
      "Building AI-powered features where output format and quality need to be controllable",
      "Reducing hallucination and improving factual accuracy in LLM-generated content",
      "Creating reusable prompt templates that can be versioned and shared across teams"
    ],
    "when_to_use_zh": [
      "将 LLM 集成到需要一致高质量输出的生产应用中",
      "构建需要可控输出格式和质量的 AI 功能",
      "减少 LLM 生成内容中的幻觉并提高事实准确性",
      "创建可版本化、可跨团队共享的可复用提示模板"
    ],
    "core_concepts": [
      "System Prompt: instructions that set the AI's role, constraints, and behavioral guidelines before user interaction",
      "Few-Shot Learning: providing exemplar input-output pairs within the prompt to guide the model's response pattern",
      "Chain-of-Thought (CoT): instructing the model to show its reasoning steps before producing a final answer",
      "Structured Output: constraining the model to respond in a specific format (JSON, XML, markdown) for programmatic parsing",
      "Prompt Versioning: treating prompts as code artifacts with version control, testing, and iterative improvement"
    ],
    "core_concepts_zh": [
      "系统提示：在用户交互前设定 AI 角色、约束和行为准则的指令",
      "少样本学习：在提示中提供示范性输入输出对，引导模型的响应模式",
      "思维链（CoT）：指示模型在给出最终答案前展示推理步骤",
      "结构化输出：约束模型以特定格式（JSON、XML、Markdown）响应，便于程序化解析",
      "提示版本控制：将提示作为代码产物进行版本控制、测试和迭代改进"
    ],
    "timeline": [
      [
        "2020",
        "GPT-3 release demonstrates that prompt design significantly affects output quality, launching the field"
      ],
      [
        "2022",
        "Google Brain publishes the Chain-of-Thought paper, showing step-by-step prompting improves reasoning"
      ],
      [
        "2023",
        "OpenAI introduces function calling and structured outputs, formalizing prompt-to-code integration patterns"
      ],
      [
        "2024",
        "Anthropic, Google, and OpenAI publish prompt engineering guides; the discipline becomes standard practice"
      ]
    ],
    "timeline_zh": [
      [
        "2020",
        "GPT-3 发布证明提示设计显著影响输出质量，开创了该领域"
      ],
      [
        "2022",
        "Google Brain 发表思维链论文，证明逐步提示能改善推理能力"
      ],
      [
        "2023",
        "OpenAI 推出函数调用和结构化输出，正式化了提示到代码的集成模式"
      ],
      [
        "2024",
        "Anthropic、Google 和 OpenAI 发布提示工程指南；该学科成为标准实践"
      ]
    ],
    "dos": [
      "Do be explicit about output format and constraints because ambiguity leads to unpredictable responses",
      "Do provide few-shot examples for complex tasks because they dramatically improve output consistency",
      "Do use chain-of-thought for reasoning tasks because it reduces errors on multi-step problems",
      "Do version and test prompts systematically because small wording changes can cause large behavior shifts"
    ],
    "dos_zh": [
      "明确说明输出格式和约束，因为歧义会导致不可预测的响应",
      "为复杂任务提供少样本示例，因为它们能显著提高输出一致性",
      "对推理任务使用思维链，因为它能减少多步骤问题上的错误",
      "系统性地对提示进行版本控制和测试，因为措辞的微小变化可能导致行为的巨大变化"
    ],
    "donts": [
      "Don't write vague prompts and expect the model to guess your intent because LLMs amplify ambiguity",
      "Don't include contradictory instructions because the model will inconsistently follow one or the other",
      "Don't skip evaluation because a prompt that works on one example may fail on edge cases",
      "Don't hardcode prompts in application code because they need to evolve independently of business logic"
    ],
    "donts_zh": [
      "不要写模糊的提示并期望模型猜测你的意图，因为 LLM 会放大歧义",
      "不要包含相互矛盾的指令，因为模型会不一致地遵循其中之一",
      "不要跳过评估，因为在一个示例上有效的提示可能在边界情况下失败",
      "不要将提示硬编码在应用代码中，因为它们需要独立于业务逻辑进行演进"
    ],
    "case_study_company": "Notion",
    "case_study": "Notion's AI writing assistant uses sophisticated prompt engineering patterns to deliver context-aware writing suggestions. By combining system prompts that define the writing style, few-shot examples of high-quality rewrites, and structured output for formatting, Notion AI can generate, summarize, and translate content while maintaining the user's document context. This approach powered Notion AI's adoption by over 30 million users within its first year.",
    "case_study_zh": "Notion 的 AI 写作助手使用精密的提示工程模式来提供上下文感知的写作建议。通过组合定义写作风格的系统提示、高质量改写的少样本示例，以及用于格式化的结构化输出，Notion AI 能够在保持用户文档上下文的同时生成、摘要和翻译内容。这种方法推动了 Notion AI 在发布第一年内被超过 3000 万用户采用。",
    "when_not_to_use": [
      "Tasks that require deterministic, reproducible results every time (use traditional algorithms instead)",
      "Safety-critical systems where LLM hallucination is an unacceptable risk",
      "Simple rule-based logic that is more reliably implemented with conventional code",
      "Applications where latency and cost of LLM inference are prohibitive"
    ],
    "when_not_to_use_zh": [
      "每次都需要确定性、可复现结果的任务（应使用传统算法）",
      "LLM 幻觉是不可接受风险的安全关键系统",
      "用传统代码实现更可靠的简单规则逻辑",
      "LLM 推理的延迟和成本难以承受的应用"
    ],
    "adopters": [
      "Notion",
      "GitHub (Copilot)",
      "Anthropic",
      "OpenAI",
      "Vercel (v0)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "usability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Jason Wei et al. (2022). \"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models\". NeurIPS 2022.",
    "secondary_sources": [
      "Jules White et al. (2023). \"A Prompt Pattern Catalog to Enhance Prompt Engineering with ChatGPT\". arXiv:2302.11382.",
      "Pengfei Liu et al. (2023). \"Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in NLP\". ACM Computing Surveys."
    ],
    "typed_relations": [
      {
        "slug": "prompt-chaining",
        "type": "complement"
      },
      {
        "slug": "tool-use-design-pattern",
        "type": "complement"
      },
      {
        "slug": "llm-system-design-patterns",
        "type": "related"
      }
    ]
  },
  {
    "id": 40,
    "name": "Tool-Use / ReAct Pattern",
    "name_zh": "工具使用 / ReAct 模式",
    "slug": "tool-use-react-pattern",
    "category": "coding",
    "desc": "Enable LLM agents to call external tools in reasoning loops",
    "desc_zh": "使大模型代理能在推理循环中调用外部工具",
    "steps": [
      "Define the tool inventory: catalog each available tool with a typed JSON Schema signature, description, and usage examples",
      "Implement the reasoning loop: prompt the LLM to think about which tool to call, execute the call, observe the result, and decide next steps",
      "Build robust tool handlers: make each tool idempotent, return structured responses with clear error messages, and enforce timeouts",
      "Add selection guardrails: constrain which tools are available per context and validate tool-call arguments before execution",
      "Evaluate tool-use accuracy: build a benchmark of representative tasks and measure tool selection precision, recall, and end-to-end task success"
    ],
    "steps_zh": [
      "定义工具清单：编目每个可用工具及其类型化JSON Schema签名、描述和使用示例",
      "实现推理循环：提示LLM思考调用哪个工具，执行调用，观察结果，并决定下一步",
      "构建健壮的工具处理器：使每个工具幂等，返回结构化响应和清晰的错误信息，并强制执行超时",
      "添加选择护栏：限制每个上下文可用的工具，在执行前验证工具调用参数",
      "评估工具使用准确率：构建代表性任务基准，度量工具选择的精确率、召回率和端到端任务成功率"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Reason",
      "Act",
      "Observe",
      "Reflect"
    ],
    "viz_labels_zh": [
      "推理",
      "行动",
      "观察",
      "反思"
    ],
    "related": [
      "react-framework",
      "prompt-engineering-patterns",
      "prompt-chaining"
    ],
    "tags": [
      "tool-use",
      "react",
      "agents",
      "function-calling",
      "reasoning"
    ],
    "origin_author": "Shunyu Yao, Jeffrey Zhao et al. (Princeton/Google Brain), 2022",
    "origin_source": "ReAct: Synergizing Reasoning and Acting in Language Models (ICLR 2023)",
    "origin_source_zh": "《ReAct：在语言模型中协同推理与行动》（ICLR 2023）",
    "complexity": "advanced",
    "when_to_use": [
      "Building AI agents that need to interact with external APIs, databases, or file systems",
      "Tasks that require combining LLM reasoning with real-time data retrieval",
      "Complex multi-step workflows where the next action depends on previous tool outputs",
      "Building coding assistants that need to read, write, and execute code"
    ],
    "when_to_use_zh": [
      "构建需要与外部 API、数据库或文件系统交互的 AI 代理",
      "需要将 LLM 推理与实时数据检索相结合的任务",
      "下一步操作依赖于前序工具输出的复杂多步骤工作流",
      "构建需要读写和执行代码的编程助手"
    ],
    "core_concepts": [
      "Reasoning Trace: the LLM's explicit thought process about which tool to use and why, visible as intermediate text",
      "Action: a structured tool call with a specific function name and typed arguments",
      "Observation: the result returned from a tool execution that feeds back into the next reasoning step",
      "Tool Schema: a JSON Schema definition of each tool's name, description, parameters, and return type",
      "Grounding: using tool outputs to anchor LLM responses in real data, reducing hallucination"
    ],
    "core_concepts_zh": [
      "推理轨迹：LLM 关于使用哪个工具及其原因的显式思考过程，作为中间文本可见",
      "行动：具有特定函数名和类型化参数的结构化工具调用",
      "观察：工具执行返回的结果，反馈到下一个推理步骤",
      "工具 Schema：每个工具的名称、描述、参数和返回类型的 JSON Schema 定义",
      "接地：使用工具输出将 LLM 响应锚定在真实数据上，减少幻觉"
    ],
    "timeline": [
      [
        "2022",
        "Yao et al. publish the ReAct paper, formalizing the reasoning-action loop for LLMs"
      ],
      [
        "2023",
        "OpenAI launches function calling API; Anthropic introduces tool use in Claude, making ReAct mainstream"
      ],
      [
        "2024",
        "LangChain, LlamaIndex, and Anthropic's Model Context Protocol (MCP) standardize tool-use abstractions for production agents"
      ],
      [
        "2025",
        "Multi-agent tool-use frameworks emerge with Claude Code and OpenAI Codex agents as prominent examples"
      ]
    ],
    "timeline_zh": [
      [
        "2022",
        "Yao 等人发表 ReAct 论文，正式定义了 LLM 的推理-行动循环"
      ],
      [
        "2023",
        "OpenAI 推出函数调用 API；Anthropic 在 Claude 中引入工具使用，使 ReAct 成为主流"
      ],
      [
        "2024",
        "LangChain、LlamaIndex 和 Anthropic 的模型上下文协议（MCP）标准化了生产级代理的工具使用抽象"
      ],
      [
        "2025",
        "多代理工具使用框架涌现，Claude Code 和 OpenAI Codex 代理是突出代表"
      ]
    ],
    "dos": [
      "Do define tool schemas precisely because vague descriptions lead to incorrect tool selection",
      "Do make tools idempotent because the LLM may retry a tool call if the observation is unclear",
      "Do limit the number of available tools per context because too many options confuse the model",
      "Do log the full reasoning trace because it is essential for debugging and improving agent behavior"
    ],
    "dos_zh": [
      "精确定义工具 schema，因为模糊的描述会导致错误的工具选择",
      "使工具幂等，因为如果观察结果不明确 LLM 可能重试工具调用",
      "限制每个上下文中的可用工具数量，因为选项太多会使模型困惑",
      "记录完整的推理轨迹，因为这对调试和改善代理行为至关重要"
    ],
    "donts": [
      "Don't give tools unrestricted access to sensitive systems because the LLM may call them unexpectedly",
      "Don't skip argument validation before tool execution because malformed inputs cause silent failures",
      "Don't let the reasoning loop run indefinitely because infinite loops waste tokens and time — set a max iteration limit",
      "Don't hardcode tool sequences because the power of ReAct is dynamic tool selection based on observations"
    ],
    "donts_zh": [
      "不要给工具对敏感系统的不受限访问，因为 LLM 可能意外调用它们",
      "不要在工具执行前跳过参数验证，因为格式错误的输入会导致静默失败",
      "不要让推理循环无限运行，因为无限循环浪费 token 和时间——设定最大迭代限制",
      "不要硬编码工具调用序列，因为 ReAct 的力量在于基于观察的动态工具选择"
    ],
    "case_study_company": "Anthropic",
    "case_study": "Anthropic's Claude Code is a production implementation of the ReAct pattern. The agent reasons about which tool to use (file read, code search, bash execution, file edit), executes the tool, observes the result, and decides whether to continue or respond. This reasoning loop allows Claude Code to handle complex multi-file refactoring tasks, debug failing tests, and build features across entire codebases — all through iterative tool use grounded in real code observations.",
    "case_study_zh": "Anthropic 的 Claude Code 是 ReAct 模式的生产级实现。代理推理应使用哪个工具（文件读取、代码搜索、bash 执行、文件编辑），执行工具，观察结果，并决定是继续还是响应。这种推理循环使 Claude Code 能够处理复杂的多文件重构任务、调试失败测试，并在整个代码库中构建功能——全部通过基于真实代码观察的迭代工具使用完成。",
    "when_not_to_use": [
      "Simple question-answering tasks where the LLM's parametric knowledge is sufficient",
      "Latency-critical applications where multiple tool-call round trips are too slow",
      "Tasks with a fixed, known sequence of steps where a traditional pipeline is more reliable",
      "Environments where tool execution costs (API calls, compute) are prohibitively expensive"
    ],
    "when_not_to_use_zh": [
      "LLM 参数知识足够的简单问答任务",
      "多次工具调用往返过慢的延迟敏感应用",
      "步骤序列固定已知、传统管道更可靠的任务",
      "工具执行成本（API 调用、计算）过高的环境"
    ],
    "adopters": [
      "Anthropic (Claude Code)",
      "OpenAI (ChatGPT Plugins)",
      "Google (Gemini)",
      "LangChain",
      "Microsoft (Copilot)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Shunyu Yao et al. (2023). \"ReAct: Synergizing Reasoning and Acting in Language Models\". ICLR 2023.",
    "secondary_sources": [
      "Timo Schick et al. (2023). \"Toolformer: Language Models Can Teach Themselves to Use Tools\". NeurIPS 2023.",
      "Takeshi Kojima et al. (2022). \"Large Language Models are Zero-Shot Reasoners\". NeurIPS 2022."
    ],
    "typed_relations": [
      {
        "slug": "react-framework",
        "type": "extends"
      },
      {
        "slug": "prompt-engineering-patterns",
        "type": "complement"
      },
      {
        "slug": "prompt-chaining",
        "type": "complement"
      }
    ]
  },
  {
    "id": 41,
    "name": "Conventional Comments",
    "name_zh": "约定式评论",
    "slug": "conventional-comments",
    "category": "coding",
    "desc": "Prefixed code review comments for clarity and actionability",
    "desc_zh": "带前缀的代码评审注释，提升清晰度与可操作性",
    "steps": [
      "Learn the label taxonomy: use prefixes like suggestion:, issue:, question:, praise:, nitpick:, thought: to categorize each comment",
      "Add decorators for urgency: append (blocking) or (non-blocking) to signal whether the comment must be resolved before merge",
      "Write the comment body: after the label, provide a clear explanation of what and why, with a suggested fix when possible",
      "Adopt team-wide: add Conventional Comments guidelines to the team's contributing guide and code review checklist",
      "Measure review quality: track resolution rates per label type and use feedback to improve review culture over time"
    ],
    "steps_zh": [
      "学习标签分类法：使用suggestion:、issue:、question:、praise:、nitpick:、thought:等前缀对每条评论分类",
      "添加紧急度修饰：附加(blocking)或(non-blocking)以标识该评论是否必须在合并前解决",
      "撰写评论正文：在标签后提供清晰的解释（什么和为什么），尽可能附上建议的修复方案",
      "全团队采纳：将约定式评论指南添加到团队贡献指南和代码评审清单中",
      "衡量评审质量：追踪各标签类型的解决率，利用反馈持续改善评审文化"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Label",
      "Subject",
      "Decoration",
      "Discussion"
    ],
    "viz_labels_zh": [
      "标签",
      "主题",
      "修饰词",
      "评论内容"
    ],
    "related": [
      "clean-code-principles",
      "ai-pair-programming",
      "semantic-versioning"
    ],
    "tags": [
      "code-review",
      "comments",
      "collaboration",
      "conventions"
    ],
    "origin_author": "Paul Slaughter, 2020",
    "origin_source": "conventionalcomments.org",
    "origin_source_zh": "conventionalcomments.org 网站",
    "complexity": "beginner",
    "when_to_use": [
      "Code reviews where reviewers want to clearly communicate intent and priority of feedback",
      "Teams experiencing misunderstandings about whether review comments are blocking or advisory",
      "Open source projects with diverse contributors who need a shared feedback language",
      "Organizations wanting to measure and improve code review culture with data"
    ],
    "when_to_use_zh": [
      "评审者希望清晰传达反馈意图和优先级的代码评审",
      "团队对评审评论是阻塞性还是建议性存在误解时",
      "需要共享反馈语言的多元贡献者开源项目",
      "希望用数据衡量和改善代码评审文化的组织"
    ],
    "core_concepts": [
      "Labels: prefixes (suggestion, issue, question, praise, nitpick, thought) that categorize the type of feedback",
      "Decorators: modifiers like (blocking) and (non-blocking) that communicate urgency and merge requirements",
      "Structured Body: the explanation that follows the label, providing context, rationale, and optionally a suggested fix",
      "Praise: explicitly calling out good code encourages positive behavior and makes reviews feel balanced",
      "Measurability: labeled comments can be aggregated and analyzed to track review patterns and team health"
    ],
    "core_concepts_zh": [
      "标签：对反馈类型进行分类的前缀（suggestion、issue、question、praise、nitpick、thought）",
      "修饰符：传达紧急程度和合并要求的修饰语，如 (blocking) 和 (non-blocking)",
      "结构化正文：标签后的解释部分，提供上下文、理由，以及可选的建议修复方案",
      "赞扬：明确指出好的代码能鼓励积极行为，使评审感觉更平衡",
      "可度量性：有标签的评论可以被聚合分析，追踪评审模式和团队健康度"
    ],
    "timeline": [
      [
        "2020",
        "Paul Slaughter publishes conventionalcomments.org, formalizing the labeling system"
      ],
      [
        "2021",
        "GitLab adopts Conventional Comments as part of their internal code review guidelines"
      ],
      [
        "2022",
        "Browser extensions and IDE plugins emerge to autocomplete Conventional Comments labels"
      ],
      [
        "2024",
        "AI code review tools (GitHub Copilot, CodeRabbit) integrate conventional comment labels into automated reviews"
      ]
    ],
    "timeline_zh": [
      [
        "2020",
        "Paul Slaughter 发布 conventionalcomments.org，正式化标签体系"
      ],
      [
        "2021",
        "GitLab 将约定式评论纳入其内部代码评审指南"
      ],
      [
        "2022",
        "浏览器扩展和 IDE 插件出现，支持约定式评论标签的自动补全"
      ],
      [
        "2024",
        "AI 代码评审工具（GitHub Copilot、CodeRabbit）在自动化评审中集成约定式评论标签"
      ]
    ],
    "dos": [
      "Do always include a label prefix because it immediately communicates the nature of the feedback",
      "Do mark comments as (non-blocking) when they are suggestions so authors know they can merge without addressing them",
      "Do use praise: labels generously because positive reinforcement improves team morale and review quality",
      "Do provide suggested fixes alongside issue: and suggestion: comments because it reduces back-and-forth"
    ],
    "dos_zh": [
      "始终包含标签前缀，因为它能立即传达反馈的性质",
      "当评论是建议时标记为 (non-blocking)，让作者知道可以不处理直接合并",
      "慷慨使用 praise: 标签，因为正面强化能提升团队士气和评审质量",
      "在 issue: 和 suggestion: 评论中附上建议的修复方案，因为这减少了来回沟通"
    ],
    "donts": [
      "Don't leave ambiguous comments without labels because the author cannot tell if it is blocking or optional",
      "Don't overuse nitpick: for substantive issues because it trains authors to ignore important feedback",
      "Don't write comment bodies without explaining 'why' because context is essential for learning",
      "Don't introduce Conventional Comments without team buy-in because inconsistent adoption creates more confusion"
    ],
    "donts_zh": [
      "不要留下没有标签的模糊评论，因为作者无法判断它是阻塞性还是可选的",
      "不要对实质性问题过度使用 nitpick:，因为这会训练作者忽略重要反馈",
      "不要在评论正文中不解释「为什么」，因为上下文对学习至关重要",
      "不要在没有团队认同的情况下引入约定式评论，因为不一致的采用会制造更多困惑"
    ],
    "case_study_company": "GitLab",
    "case_study": "GitLab adopted Conventional Comments across its engineering organization to standardize how feedback is given in merge requests. By labeling every comment with a category and blocking/non-blocking decorator, GitLab reduced average merge request review cycles from 3.2 rounds to 1.8 rounds. The structured format also enabled GitLab to build dashboards tracking review patterns and identify teams that needed coaching.",
    "case_study_zh": "GitLab 在整个工程组织中采用了约定式评论，以标准化合并请求中的反馈方式。通过为每条评论标注类别和阻塞/非阻塞修饰符，GitLab 将平均合并请求评审周期从 3.2 轮减少到 1.8 轮。结构化格式还使 GitLab 能够构建追踪评审模式的仪表板，识别需要辅导的团队。",
    "when_not_to_use": [
      "Solo developers with no code review process",
      "Pair programming sessions where feedback is given verbally in real time",
      "Very small teams where informal communication is sufficient and overhead is unwanted",
      "Automated CI feedback where structured linter output replaces human comments"
    ],
    "when_not_to_use_zh": [
      "没有代码评审流程的独立开发者",
      "结对编程会话中反馈以实时口头方式给出",
      "非正式沟通已足够且不需要额外开销的极小团队",
      "结构化 linter 输出替代人工评论的自动化 CI 反馈"
    ],
    "adopters": [
      "GitLab",
      "Shopify",
      "HashiCorp",
      "DigitalOcean",
      "Sourcegraph"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Paul Slaughter (2020). \"Conventional Comments\". conventionalcomments.org.",
    "secondary_sources": [
      "Google Engineering Practices (2019). \"How to Do a Code Review\". google.github.io/eng-practices.",
      "Karl Wiegers (2002). \"Peer Reviews in Software: A Practical Guide\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "ai-pair-programming",
        "type": "complement"
      },
      {
        "slug": "semantic-versioning",
        "type": "related"
      }
    ]
  },
  {
    "id": 42,
    "name": "Semantic Versioning (SemVer)",
    "name_zh": "语义化版本控制",
    "slug": "semantic-versioning",
    "category": "coding",
    "desc": "Version APIs and libraries with MAJOR.MINOR.PATCH semantics",
    "desc_zh": "使用主版本.次版本.补丁版本语义对API和库进行版本控制",
    "steps": [
      "Understand the contract: MAJOR for breaking changes, MINOR for backward-compatible features, PATCH for backward-compatible bug fixes",
      "Define your public API surface: explicitly document which interfaces, endpoints, or exports constitute the versioned contract",
      "Automate version bumps: use commit message conventions (Conventional Commits) and tools (semantic-release) to compute the next version",
      "Communicate changes: maintain a CHANGELOG that maps each version to its features, fixes, and breaking changes with migration guides",
      "Enforce in CI: validate that breaking changes trigger MAJOR bumps; use contract tests to detect unintentional API surface changes"
    ],
    "steps_zh": [
      "理解契约：MAJOR用于破坏性变更，MINOR用于向后兼容的新功能，PATCH用于向后兼容的缺陷修复",
      "定义公共API面：明确记录哪些接口、端点或导出构成版本化契约",
      "自动化版本升级：使用提交消息约定（Conventional Commits）和工具（semantic-release）计算下一个版本",
      "传达变更：维护CHANGELOG，将每个版本映射到其功能、修复和破坏性变更，附带迁移指南",
      "在CI中强制执行：验证破坏性变更触发MAJOR升级；使用契约测试检测非预期的API面变更"
    ],
    "ai_relevant": false,
    "viz_type": "timeline",
    "viz_labels": [
      "Major",
      "Minor",
      "Patch",
      "Pre-release"
    ],
    "viz_labels_zh": [
      "主版本",
      "次版本",
      "修订版本",
      "预发布"
    ],
    "related": [
      "api-versioning-strategies",
      "contract-testing",
      "conventional-comments"
    ],
    "tags": [
      "versioning",
      "semver",
      "api",
      "changelog",
      "release-management"
    ],
    "origin_author": "Tom Preston-Werner, 2011",
    "origin_source": "Semantic Versioning 2.0.0 (semver.org)",
    "origin_source_zh": "《语义化版本 2.0.0》（semver.org）",
    "complexity": "beginner",
    "when_to_use": [
      "Publishing libraries or SDKs consumed by external developers who need dependency stability",
      "Managing API versions for public or partner-facing services",
      "Automating release pipelines where version numbers drive deployment decisions",
      "Any shared package in a monorepo or package registry where consumers need predictable upgrade paths"
    ],
    "when_to_use_zh": [
      "发布供外部开发者使用且需要依赖稳定性的库或 SDK",
      "管理面向公众或合作伙伴服务的 API 版本",
      "自动化发布管道中版本号驱动部署决策的场景",
      "monorepo 或包注册表中消费者需要可预测升级路径的共享包"
    ],
    "core_concepts": [
      "MAJOR version: incremented for incompatible API changes that require consumers to modify their code",
      "MINOR version: incremented for new functionality added in a backward-compatible manner",
      "PATCH version: incremented for backward-compatible bug fixes that do not add new features",
      "Pre-release identifiers: suffixes like -alpha.1, -beta.2, -rc.1 that indicate unstable versions",
      "Public API contract: the explicitly defined surface (functions, types, endpoints) that the version number protects"
    ],
    "core_concepts_zh": [
      "MAJOR 版本：不兼容的 API 变更，需要消费者修改代码时递增",
      "MINOR 版本：以向后兼容方式添加新功能时递增",
      "PATCH 版本：不添加新功能的向后兼容缺陷修复时递增",
      "预发布标识：如 -alpha.1、-beta.2、-rc.1 等后缀，表示不稳定版本",
      "公共 API 契约：版本号所保护的显式定义的表面（函数、类型、端点）"
    ],
    "timeline": [
      [
        "2009",
        "Tom Preston-Werner drafts the first version of the SemVer specification"
      ],
      [
        "2011",
        "Semantic Versioning 2.0.0 is published at semver.org, becoming the widely adopted standard"
      ],
      [
        "2015",
        "Conventional Commits specification emerges, enabling automated SemVer bumps from commit messages"
      ],
      [
        "2017",
        "semantic-release and similar tools automate the entire version-bump-publish pipeline in CI/CD"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "Tom Preston-Werner 起草了 SemVer 规范的第一个版本"
      ],
      [
        "2011",
        "语义化版本 2.0.0 在 semver.org 发布，成为被广泛采用的标准"
      ],
      [
        "2015",
        "Conventional Commits 规范出现，使从提交消息自动化 SemVer 版本升级成为可能"
      ],
      [
        "2017",
        "semantic-release 等工具在 CI/CD 中自动化了整个版本升级-发布管道"
      ]
    ],
    "dos": [
      "Do define your public API explicitly because unclear API boundaries lead to accidental breaking changes",
      "Do use Conventional Commits to automate version bumps because manual versioning is error-prone",
      "Do maintain a CHANGELOG because consumers need to understand what changed before upgrading",
      "Do start at 0.y.z during initial development because the 0.x convention signals instability"
    ],
    "dos_zh": [
      "明确定义公共 API，因为不清晰的 API 边界会导致意外的破坏性变更",
      "使用 Conventional Commits 自动化版本升级，因为手动版本控制容易出错",
      "维护 CHANGELOG，因为消费者在升级前需要了解发生了什么变化",
      "在初始开发期间从 0.y.z 开始，因为 0.x 约定表示不稳定"
    ],
    "donts": [
      "Don't bump MAJOR for internal changes that don't affect the public API because it creates unnecessary upgrade friction",
      "Don't release breaking changes as MINOR or PATCH because it silently breaks consumer builds",
      "Don't use SemVer for artifacts without a defined public API (like applications) because the contract is meaningless",
      "Don't skip version numbers because gaps confuse consumers and suggest missing releases"
    ],
    "donts_zh": [
      "不要为不影响公共 API 的内部变更升级 MAJOR，因为这造成不必要的升级阻力",
      "不要将破坏性变更作为 MINOR 或 PATCH 发布，因为这会悄悄破坏消费者的构建",
      "不要对没有定义公共 API 的产物（如应用程序）使用 SemVer，因为契约没有意义",
      "不要跳过版本号，因为间隔会让消费者困惑并暗示有遗漏的发布"
    ],
    "case_study_company": "npm (Node.js)",
    "case_study": "npm, the Node.js package registry, adopted Semantic Versioning as the standard for all published packages. The package.json dependency resolution system (^, ~, ranges) is built directly on SemVer semantics. This enabled npm to support over 2 million packages with automated dependency updates via tools like Dependabot and Renovate, while giving developers confidence that PATCH and MINOR updates won't break their builds.",
    "case_study_zh": "Node.js 包注册表 npm 将语义化版本控制采纳为所有发布包的标准。package.json 的依赖解析系统（^、~、范围）直接构建在 SemVer 语义之上。这使 npm 能够支持超过 200 万个包，通过 Dependabot 和 Renovate 等工具实现自动化依赖更新，同时让开发者有信心 PATCH 和 MINOR 更新不会破坏其构建。",
    "when_not_to_use": [
      "Internal applications or services that are not consumed as versioned dependencies",
      "Continuously deployed services where every commit is a release and version numbers are timestamps or commit hashes",
      "Monorepo packages that are always released together and share a single version number",
      "Marketing or product version numbers that serve branding rather than technical compatibility purposes"
    ],
    "when_not_to_use_zh": [
      "不作为版本化依赖被消费的内部应用或服务",
      "每次提交即发布、版本号为时间戳或提交哈希的持续部署服务",
      "总是一起发布且共享单一版本号的 monorepo 包",
      "用于品牌而非技术兼容性目的的市场营销或产品版本号"
    ],
    "adopters": [
      "npm",
      "Rust (Cargo)",
      "Go modules",
      "Maven Central",
      "PyPI"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Tom Preston-Werner (2011). \"Semantic Versioning 2.0.0\". semver.org.",
    "secondary_sources": [
      "Russ Cox (2019). \"Semantic Import Versioning\". research.swtch.com.",
      "IETF RFC 2119, Scott Bradner (1997). \"Key words for use in RFCs to Indicate Requirement Levels\". IETF."
    ],
    "typed_relations": [
      {
        "slug": "api-versioning-strategies",
        "type": "complement"
      },
      {
        "slug": "contract-testing",
        "type": "complement"
      },
      {
        "slug": "conventional-comments",
        "type": "complement"
      }
    ]
  },
  {
    "id": 43,
    "name": "Contract Testing",
    "name_zh": "契约测试",
    "slug": "contract-testing",
    "category": "coding",
    "desc": "Verify service interactions via shared consumer-provider contracts",
    "desc_zh": "通过共享的消费者-提供者契约验证服务间交互的正确性",
    "steps": [
      "Define consumer expectations: in each consumer service, write contract tests that describe the exact requests it makes and responses it expects",
      "Publish contracts to a broker: use a tool like Pact or Spring Cloud Contract to share consumer contracts with the provider team via a central broker",
      "Verify on the provider side: run the published contracts against the actual provider implementation to confirm it satisfies all consumer expectations",
      "Integrate into CI/CD: run contract verification on both consumer and provider pipelines; block deployment if contracts are broken",
      "Evolve contracts safely: use the broker's compatibility matrix (can-i-deploy) to verify that all deployed versions are mutually compatible before release"
    ],
    "steps_zh": [
      "定义消费者期望：在每个消费者服务中编写契约测试，描述其发出的精确请求和期望的响应",
      "将契约发布到代理：使用Pact或Spring Cloud Contract等工具通过中央代理与提供者团队共享消费者契约",
      "在提供者端验证：针对提供者的实际实现运行已发布的契约，确认其满足所有消费者期望",
      "集成到CI/CD：在消费者和提供者的流水线中运行契约验证；契约被破坏时阻止部署",
      "安全演进契约：使用代理的兼容性矩阵（can-i-deploy）在发布前验证所有已部署版本的互兼容性"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Consumer",
      "Contract",
      "Provider",
      "Verification"
    ],
    "viz_labels_zh": [
      "消费者",
      "契约",
      "提供者",
      "验证"
    ],
    "related": [
      "semantic-versioning",
      "api-versioning-strategies",
      "richardson-maturity-model",
      "design-by-contract"
    ],
    "tags": [
      "contract-testing",
      "pact",
      "consumer-driven",
      "api-compatibility",
      "microservices"
    ],
    "origin_author": "Ian Robinson, Martin Fowler, 2006",
    "origin_source": "Consumer-Driven Contracts: A Service Evolution Pattern (Martin Fowler's blog)",
    "origin_source_zh": "《消费者驱动的契约：一种服务演进模式》（Martin Fowler 博客）",
    "complexity": "intermediate",
    "when_to_use": [
      "Microservice architectures where services are deployed independently by different teams",
      "API evolution scenarios where providers need confidence they won't break existing consumers",
      "Replacing slow end-to-end integration tests with fast, focused contract verification",
      "Organizations adopting consumer-driven development where consumers define the API they need"
    ],
    "when_to_use_zh": [
      "不同团队独立部署服务的微服务架构",
      "提供者需要确信不会破坏现有消费者的 API 演进场景",
      "用快速、聚焦的契约验证替代缓慢的端到端集成测试",
      "采用消费者驱动开发、由消费者定义所需 API 的组织"
    ],
    "core_concepts": [
      "Consumer Contract: a test written by the consumer that specifies the exact request/response it expects from a provider",
      "Provider Verification: running consumer contracts against the real provider to ensure it meets all expectations",
      "Pact Broker: a central repository that stores contracts and tracks compatibility between consumer and provider versions",
      "Can-I-Deploy: a pre-deployment check that verifies all interacting service versions are compatible based on verified contracts",
      "Consumer-Driven: the philosophy that consumers define the API contract, and providers must satisfy those needs"
    ],
    "core_concepts_zh": [
      "消费者契约：由消费者编写的测试，指定其期望从提供者获得的精确请求/响应",
      "提供者验证：针对真实提供者运行消费者契约，确保其满足所有期望",
      "Pact Broker：存储契约并追踪消费者与提供者版本间兼容性的中央仓库",
      "Can-I-Deploy：基于已验证契约检查所有交互服务版本是否兼容的预部署检查",
      "消费者驱动：消费者定义 API 契约，提供者必须满足这些需求的理念"
    ],
    "timeline": [
      [
        "2006",
        "Ian Robinson publishes 'Consumer-Driven Contracts' on Martin Fowler's blog"
      ],
      [
        "2013",
        "Pact, the first consumer-driven contract testing framework, is created by REA Group in Australia"
      ],
      [
        "2016",
        "Spring Cloud Contract is released, bringing contract testing to the Spring/Java ecosystem"
      ],
      [
        "2020",
        "Pactflow (commercial Pact Broker) launches, making enterprise-grade contract testing accessible"
      ]
    ],
    "timeline_zh": [
      [
        "2006",
        "Ian Robinson 在 Martin Fowler 博客上发表《消费者驱动的契约》"
      ],
      [
        "2013",
        "第一个消费者驱动契约测试框架 Pact 由澳大利亚 REA Group 创建"
      ],
      [
        "2016",
        "Spring Cloud Contract 发布，将契约测试带入 Spring/Java 生态系统"
      ],
      [
        "2020",
        "Pactflow（商业 Pact Broker）推出，使企业级契约测试变得易于采用"
      ]
    ],
    "dos": [
      "Do write contracts from the consumer's perspective because they define the actual usage patterns",
      "Do run provider verification in CI because manual verification is unreliable and easy to skip",
      "Do use the can-i-deploy check before every deployment because it prevents incompatible versions from reaching production",
      "Do keep contracts minimal and focused because over-specified contracts become brittle"
    ],
    "dos_zh": [
      "从消费者角度编写契约，因为它们定义了实际的使用模式",
      "在 CI 中运行提供者验证，因为手动验证不可靠且容易被跳过",
      "在每次部署前使用 can-i-deploy 检查，因为它能防止不兼容版本到达生产环境",
      "保持契约最小化且聚焦，因为过度指定的契约会变得脆弱"
    ],
    "donts": [
      "Don't test implementation details in contracts because contracts should verify behavior, not internal structure",
      "Don't skip the Pact Broker because without it you lose version tracking and the can-i-deploy safety net",
      "Don't treat contract tests as a replacement for all integration tests because they verify interactions, not end-to-end flows",
      "Don't let provider teams write consumer contracts because it defeats the consumer-driven purpose"
    ],
    "donts_zh": [
      "不要在契约中测试实现细节，因为契约应验证行为而非内部结构",
      "不要跳过 Pact Broker，因为没有它你会失去版本追踪和 can-i-deploy 安全网",
      "不要将契约测试视为所有集成测试的替代，因为它们验证交互而非端到端流程",
      "不要让提供者团队编写消费者契约，因为这违背了消费者驱动的宗旨"
    ],
    "case_study_company": "Atlassian",
    "case_study": "Atlassian adopted Pact-based contract testing across their microservice ecosystem powering Jira, Confluence, and Bitbucket. With over 800 microservices, end-to-end integration testing was too slow and flaky. Contract testing reduced their integration test suite runtime from hours to minutes and cut production API compatibility incidents by 70%. The Pact Broker's can-i-deploy feature became a mandatory gate in their deployment pipeline.",
    "case_study_zh": "Atlassian 在其支撑 Jira、Confluence 和 Bitbucket 的微服务生态系统中采用了基于 Pact 的契约测试。拥有超过 800 个微服务，端到端集成测试太慢且不稳定。契约测试将其集成测试套件运行时间从数小时缩短到数分钟，并将生产环境 API 兼容性事故减少了 70%。Pact Broker 的 can-i-deploy 功能成为其部署管道中的强制门禁。",
    "when_not_to_use": [
      "Monolithic applications where all components are deployed together",
      "Simple two-service architectures where a shared integration test is sufficient",
      "Teams without CI/CD pipelines to automate contract verification",
      "Prototyping phases where APIs change rapidly and contracts would be constantly broken"
    ],
    "when_not_to_use_zh": [
      "所有组件一起部署的单体应用",
      "共享集成测试已足够的简单双服务架构",
      "没有 CI/CD 管道来自动化契约验证的团队",
      "API 快速变化、契约会不断被破坏的原型开发阶段"
    ],
    "adopters": [
      "Atlassian",
      "REA Group",
      "ING Bank",
      "Booking.com",
      "SEEK"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "Ian Robinson (2006). \"Consumer-Driven Contracts: A Service Evolution Pattern\". martinfowler.com.",
    "secondary_sources": [
      "Martin Fowler (2011). \"Integration Contract Test\". martinfowler.com.",
      "Beth Skurrie et al. (2017). \"Pact: Consumer-Driven Contract Testing\". pact.io."
    ],
    "typed_relations": [
      {
        "slug": "semantic-versioning",
        "type": "complement"
      },
      {
        "slug": "api-versioning-strategies",
        "type": "complement"
      },
      {
        "slug": "richardson-maturity-model",
        "type": "complement"
      },
      {
        "slug": "design-by-contract",
        "type": "extends"
      }
    ]
  },
  {
    "id": 173,
    "name": "Strangler Fig at Code Level",
    "name_zh": "代码级绞杀者模式",
    "slug": "strangler-fig-at-code-level",
    "category": "coding",
    "desc": "Incrementally replace legacy code modules by wrapping them and redirecting calls to new implementations",
    "desc_zh": "通过包装遗留代码模块并将调用重定向到新实现来渐进式替换旧代码",
    "steps": [
      "Identify the legacy module: select a specific class, function, or module whose behavior you need to replace, and write characterization tests to capture its current behavior",
      "Create the new implementation: build the replacement module alongside the legacy one, implementing the same interface or contract with improved design",
      "Introduce an abstraction layer: place an adapter, facade, or routing layer between callers and the legacy module so you can redirect traffic without changing call sites",
      "Incrementally redirect: route calls from the legacy module to the new implementation one method or code path at a time, verifying behavior with tests after each switch",
      "Remove the legacy module: once all calls are routed to the new implementation and tests pass, delete the old code and simplify the abstraction layer if it is no longer needed"
    ],
    "steps_zh": [
      "识别遗留模块：选择需要替换的特定类、函数或模块，编写特征测试来捕获其当前行为",
      "创建新实现：在遗留模块旁构建替代模块，以改进的设计实现相同的接口或契约",
      "引入抽象层：在调用方与遗留模块之间放置适配器、外观或路由层，使你可以在不更改调用点的情况下重定向流量",
      "渐进式重定向：逐个方法或代码路径地将调用从遗留模块路由到新实现，每次切换后用测试验证行为",
      "移除遗留模块：一旦所有调用都已路由到新实现且测试通过，删除旧代码并在不再需要时简化抽象层"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Legacy Code",
      "Strangler Facade",
      "New Module",
      "Migration"
    ],
    "viz_labels_zh": [
      "遗留代码",
      "绞杀者门面",
      "新模块",
      "迁移路径"
    ],
    "related": [
      "strangler-fig-pattern",
      "branch-by-abstraction",
      "design-by-contract"
    ],
    "tags": [
      "refactoring",
      "legacy-code",
      "incremental-replacement",
      "strangler-fig",
      "migration"
    ],
    "origin_author": "Martin Fowler, 2004; Michael Feathers, 2004",
    "origin_source": "Refactoring (Fowler, 2018); Working Effectively with Legacy Code (Feathers, 2004)",
    "origin_source_zh": "《重构》（Fowler，2018）；《修改代码的艺术》（Feathers，2004）",
    "complexity": "advanced",
    "when_to_use": [
      "Replacing a tangled legacy module that cannot be safely rewritten all at once",
      "Migrating from an outdated internal library to a modern replacement without halting feature development",
      "Refactoring a critical code path where a big-bang rewrite carries too much risk",
      "Transitioning between different design patterns within a codebase incrementally"
    ],
    "when_to_use_zh": [
      "替换无法安全地一次性重写的纠缠遗留模块",
      "从过时的内部库迁移到现代替代品，同时不中断功能开发",
      "重构关键代码路径，整体重写风险过高时",
      "在代码库中渐进式地在不同设计模式之间转换"
    ],
    "core_concepts": [
      "Characterization Tests: tests written to document the existing behavior of legacy code, forming a safety net before any changes are made",
      "Abstraction Layer: a facade, adapter, or interface placed between callers and the legacy module to enable transparent redirection",
      "Incremental Replacement: replacing legacy behavior one code path at a time rather than attempting a complete rewrite",
      "Parallel Running: keeping both old and new implementations available simultaneously so you can compare outputs and roll back if needed",
      "Legacy Module Retirement: the final step of removing the old code once all traffic has been verified on the new implementation"
    ],
    "core_concepts_zh": [
      "特征测试：编写用于记录遗留代码现有行为的测试，在做任何更改之前形成安全网",
      "抽象层：放置在调用方与遗留模块之间的外观、适配器或接口，以实现透明重定向",
      "渐进式替换：逐个代码路径替换遗留行为，而非尝试完全重写",
      "并行运行：同时保持新旧实现可用，以便比较输出并在需要时回滚",
      "遗留模块退役：在所有流量都已在新实现上验证后移除旧代码的最终步骤"
    ],
    "timeline": [
      [
        "2004",
        "Martin Fowler publishes the Strangler Fig Application article, describing system-level incremental migration"
      ],
      [
        "2004",
        "Michael Feathers publishes 'Working Effectively with Legacy Code', providing code-level techniques for wrapping and replacing legacy modules"
      ],
      [
        "2010",
        "The technique gains traction in large-scale enterprise refactoring projects at companies like ThoughtWorks"
      ],
      [
        "2018",
        "Fowler's second edition of 'Refactoring' formalizes incremental code-level replacement patterns"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "Martin Fowler 发表绞杀者无花果应用文章，描述系统级渐进式迁移"
      ],
      [
        "2004",
        "Michael Feathers 出版《修改代码的艺术》，提供包装和替换遗留模块的代码级技术"
      ],
      [
        "2010",
        "该技术在 ThoughtWorks 等公司的大规模企业重构项目中获得广泛采用"
      ],
      [
        "2018",
        "Fowler 的《重构》第二版正式化了渐进式代码级替换模式"
      ]
    ],
    "dos": [
      "Do write characterization tests before touching legacy code because they are your safety net against behavioral regressions",
      "Do keep the abstraction layer thin because a heavyweight adapter becomes its own maintenance burden",
      "Do replace one code path at a time because small increments let you catch problems early and roll back easily",
      "Do delete the old code promptly after migration because leaving dead code creates confusion and false dependencies"
    ],
    "dos_zh": [
      "在修改遗留代码之前编写特征测试，因为它们是防止行为回归的安全网",
      "保持抽象层轻薄，因为笨重的适配器本身会成为维护负担",
      "一次替换一个代码路径，因为小增量让你能尽早发现问题并轻松回滚",
      "迁移完成后及时删除旧代码，因为留下死代码会造成混淆和虚假依赖"
    ],
    "donts": [
      "Don't attempt to replace the entire module in one commit because it defeats the purpose of incremental migration",
      "Don't skip characterization tests because without them you have no way to verify behavioral equivalence",
      "Don't let the abstraction layer become permanent middleware because it should be temporary scaffolding",
      "Don't mix functional changes with migration changes because it makes it impossible to isolate the source of defects"
    ],
    "donts_zh": [
      "不要试图在一次提交中替换整个模块，因为这违背了渐进式迁移的初衷",
      "不要跳过特征测试，因为没有它们你无法验证行为等价性",
      "不要让抽象层变成永久中间件，因为它应该是临时脚手架",
      "不要将功能变更与迁移变更混在一起，因为这使得无法隔离缺陷来源"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub incrementally replaced its legacy Ruby permission system with a new authorization service over 18 months. They introduced a thin routing layer that checked a feature flag to decide whether to call the old Ruby module or the new service for each permission check. Teams migrated one resource type at a time (repositories, then organizations, then actions), running both systems in parallel and comparing results. The legacy permission code was fully retired without any user-facing disruption.",
    "case_study_zh": "GitHub 在 18 个月内渐进式地将其遗留 Ruby 权限系统替换为新的授权服务。他们引入了一个薄路由层，通过功能标志决定每次权限检查是调用旧的 Ruby 模块还是新服务。团队逐个资源类型进行迁移（先是仓库，然后是组织，最后是操作），并行运行两个系统并比较结果。遗留权限代码被完全退役，没有任何面向用户的中断。",
    "when_not_to_use": [
      "Small modules that can be safely rewritten and replaced in a single commit",
      "Code with no callers or very few call sites where a direct swap is simpler",
      "Prototyping or throwaway code where long-term maintainability is irrelevant",
      "Situations where the legacy module has no observable side effects and can be unit-tested in isolation"
    ],
    "when_not_to_use_zh": [
      "可以在单次提交中安全重写和替换的小模块",
      "没有调用方或调用点极少、直接替换更简单的代码",
      "原型或一次性代码，长期可维护性无关紧要",
      "遗留模块没有可观察的副作用且可以独立进行单元测试的情况"
    ],
    "adopters": [
      "GitHub",
      "ThoughtWorks",
      "Shopify",
      "Stripe",
      "SoundCloud"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Martin Fowler (2004). \"StranglerFigApplication\". martinfowler.com.",
    "secondary_sources": [
      "Michael Feathers (2004). \"Working Effectively with Legacy Code\". Prentice Hall.",
      "Martin Fowler (2018). \"Refactoring: Improving the Design of Existing Code, 2nd Edition\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "strangler-fig-pattern",
        "type": "extends"
      },
      {
        "slug": "branch-by-abstraction",
        "type": "complement"
      },
      {
        "slug": "design-by-contract",
        "type": "complement"
      }
    ]
  },
  {
    "id": 174,
    "name": "Feature Toggles at Code Level",
    "name_zh": "代码级功能开关",
    "slug": "feature-toggles-at-code-level",
    "category": "coding",
    "desc": "Control code execution paths using conditional branching to enable or disable features without redeployment",
    "desc_zh": "使用条件分支控制代码执行路径，无需重新部署即可启用或禁用功能",
    "steps": [
      "Define toggle points: identify the exact locations in your code where behavior should diverge based on a feature flag, keeping them at the highest possible level",
      "Implement toggle infrastructure: create a simple toggle router or use a library that reads flag values from configuration, environment variables, or a remote service",
      "Guard new code paths: wrap the new feature code behind conditional checks that consult the toggle router, keeping the old path as the default fallback",
      "Test both paths: write tests that exercise the code with the toggle on and off to ensure both branches behave correctly",
      "Clean up expired toggles: once a feature is fully rolled out, remove the toggle and its conditional branching to prevent toggle debt accumulation"
    ],
    "steps_zh": [
      "定义开关点：确定代码中行为应基于功能标志产生分歧的精确位置，尽量将其放在最高层级",
      "实现开关基础设施：创建简单的开关路由器或使用从配置、环境变量或远程服务读取标志值的库",
      "保护新代码路径：将新功能代码包裹在查询开关路由器的条件检查后面，保持旧路径作为默认回退",
      "测试两条路径：编写在开关打开和关闭状态下都执行代码的测试，确保两个分支行为正确",
      "清理过期开关：功能完全上线后，移除开关及其条件分支以防止开关债务累积"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Toggle Point",
      "Toggle Config",
      "Release Flag",
      "Kill Switch"
    ],
    "viz_labels_zh": [
      "开关点",
      "开关配置",
      "发布标志",
      "紧急关闭"
    ],
    "related": [
      "strangler-fig-at-code-level",
      "branch-by-abstraction",
      "feature-branch-strategy",
      "clean-code-principles"
    ],
    "tags": [
      "feature-toggles",
      "feature-flags",
      "branching-in-code",
      "conditional-logic",
      "trunk-based"
    ],
    "origin_author": "Martin Fowler, 2010; Pete Hodgson, 2017",
    "origin_source": "Refactoring (Fowler, 2018); Feature Toggles (Pete Hodgson, martinfowler.com)",
    "origin_source_zh": "《重构》（Fowler，2018）；《功能开关》（Pete Hodgson，martinfowler.com）",
    "complexity": "intermediate",
    "when_to_use": [
      "Practicing trunk-based development where long-lived feature branches are avoided",
      "Releasing partially complete features to production safely hidden behind a flag",
      "Performing A/B testing or gradual rollouts by toggling features for specific user segments",
      "Enabling operations teams to disable problematic features instantly without a code deployment"
    ],
    "when_to_use_zh": [
      "实践主干开发，避免长生命周期的功能分支",
      "将部分完成的功能安全地隐藏在标志后面发布到生产环境",
      "通过为特定用户群体切换功能来进行A/B测试或渐进式发布",
      "使运维团队能够无需代码部署即可即时禁用有问题的功能"
    ],
    "core_concepts": [
      "Release Toggle: a short-lived flag that hides incomplete features in production until they are ready for launch",
      "Experiment Toggle: a flag used for A/B testing that routes different users to different code paths to measure outcomes",
      "Ops Toggle: a long-lived flag that allows operations to degrade or disable features under load without redeploying",
      "Toggle Router: the decision engine that evaluates toggle state based on configuration, user context, or percentage rollout",
      "Toggle Debt: the technical debt that accumulates when expired toggles and their conditional branches are not cleaned up"
    ],
    "core_concepts_zh": [
      "发布开关：短生命周期的标志，在功能准备好发布前将未完成的功能隐藏在生产环境中",
      "实验开关：用于A/B测试的标志，将不同用户路由到不同代码路径以衡量结果",
      "运维开关：长生命周期的标志，允许运维在负载下降级或禁用功能而无需重新部署",
      "开关路由器：根据配置、用户上下文或百分比发布评估开关状态的决策引擎",
      "开关债务：过期开关及其条件分支未被清理时累积的技术债务"
    ],
    "timeline": [
      [
        "2010",
        "Martin Fowler publishes 'FeatureToggle' on his blog, defining the core concept of branching in code"
      ],
      [
        "2013",
        "Release toggles become standard practice in continuous delivery pipelines at companies like Flickr and Etsy"
      ],
      [
        "2017",
        "Pete Hodgson publishes a comprehensive taxonomy of feature toggle types on martinfowler.com"
      ],
      [
        "2020",
        "Feature flag platforms like LaunchDarkly and Unleash achieve widespread adoption, making toggle management a first-class concern"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "Martin Fowler 在其博客上发表《FeatureToggle》，定义了代码中分支的核心概念"
      ],
      [
        "2013",
        "发布开关在 Flickr 和 Etsy 等公司的持续交付管道中成为标准实践"
      ],
      [
        "2017",
        "Pete Hodgson 在 martinfowler.com 上发表了功能开关类型的全面分类"
      ],
      [
        "2020",
        "LaunchDarkly 和 Unleash 等功能标志平台获得广泛采用，使开关管理成为一等关注点"
      ]
    ],
    "dos": [
      "Do place toggle points at the highest level possible because deeply nested toggles make code harder to reason about",
      "Do set expiration dates for release toggles because forgotten flags become permanent complexity",
      "Do test both toggle states in your test suite because untested paths will break silently in production",
      "Do use a toggle naming convention because consistent names make it easy to search for and audit active flags"
    ],
    "dos_zh": [
      "将开关点放在尽可能高的层级，因为深层嵌套的开关使代码更难理解",
      "为发布开关设置过期日期，因为被遗忘的标志会成为永久的复杂性",
      "在测试套件中测试两种开关状态，因为未测试的路径会在生产环境中默默失败",
      "使用开关命名约定，因为一致的名称便于搜索和审计活跃的标志"
    ],
    "donts": [
      "Don't nest multiple toggles in the same code path because the combinatorial explosion of states becomes untestable",
      "Don't use toggles as a permanent branching mechanism because they should be temporary by design",
      "Don't store toggle state only in code because it eliminates the ability to change flags without redeployment",
      "Don't skip toggle cleanup after rollout because accumulated toggle debt degrades codebase readability"
    ],
    "donts_zh": [
      "不要在同一代码路径中嵌套多个开关，因为状态的组合爆炸使其无法测试",
      "不要将开关用作永久分支机制，因为它们在设计上应该是临时的",
      "不要仅在代码中存储开关状态，因为这消除了无需重新部署即可更改标志的能力",
      "不要在发布后跳过开关清理，因为累积的开关债务会降低代码库的可读性"
    ],
    "case_study_company": "Etsy",
    "case_study": "Etsy pioneered the use of code-level feature toggles as part of their continuous deployment culture. With over 50 deployments per day, feature flags allowed engineers to merge incomplete features to trunk and deploy safely. Their internal 'Feature' API let developers toggle features per user, per percentage, or per employee group. When a new search ranking algorithm caused unexpected results, the ops team disabled it within seconds via a toggle without rolling back the deployment. Etsy's disciplined toggle cleanup process ensured that expired flags were removed within two weeks of full rollout.",
    "case_study_zh": "Etsy 作为其持续部署文化的一部分，率先使用了代码级功能开关。每天超过 50 次部署，功能标志允许工程师将未完成的功能合并到主干并安全部署。他们内部的 'Feature' API 让开发者可以按用户、按百分比或按员工组切换功能。当新的搜索排序算法导致意外结果时，运维团队在几秒内通过开关禁用了它，而无需回滚部署。Etsy 严格的开关清理流程确保过期标志在完全发布后两周内被移除。",
    "when_not_to_use": [
      "Simple applications with a single deployment target and no need for gradual rollout",
      "Code paths where the toggle conditional would add more complexity than the feature itself",
      "Performance-critical hot paths where even a simple conditional check is unacceptable overhead",
      "Teams without the discipline to clean up expired toggles, as toggle debt will degrade the codebase"
    ],
    "when_not_to_use_zh": [
      "只有单一部署目标且无需渐进式发布的简单应用",
      "开关条件会增加比功能本身更多复杂性的代码路径",
      "即使简单条件检查也会带来不可接受开销的性能关键热路径",
      "缺乏清理过期开关纪律的团队，因为开关债务会降低代码库质量"
    ],
    "adopters": [
      "Etsy",
      "Netflix",
      "Facebook",
      "Google",
      "LaunchDarkly"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Pete Hodgson (2017). \"Feature Toggles (aka Feature Flags)\". martinfowler.com.",
    "secondary_sources": [
      "Martin Fowler (2010). \"FeatureToggle\". martinfowler.com.",
      "Jez Humble and David Farley (2010). \"Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "strangler-fig-at-code-level",
        "type": "complement"
      },
      {
        "slug": "branch-by-abstraction",
        "type": "complement"
      },
      {
        "slug": "feature-branch-strategy",
        "type": "alternative"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      }
    ]
  },
  {
    "id": 175,
    "name": "Immutability Pattern",
    "name_zh": "不可变性模式",
    "slug": "immutability-pattern",
    "category": "coding",
    "desc": "Prefer immutable data structures to eliminate shared mutable state and improve safety, concurrency, and reasoning",
    "desc_zh": "优先使用不可变数据结构以消除共享可变状态，提高安全性、并发性和可推理性",
    "steps": [
      "Default to immutable declarations: use const, final, readonly, or val by default; only use mutable bindings when mutation is explicitly required",
      "Design immutable data types: create classes or records where all fields are set at construction time and no setters or mutating methods are exposed",
      "Use transformation over mutation: instead of modifying an object in place, produce a new object with the desired changes using copy-on-write or builder patterns",
      "Leverage persistent data structures: for collections that change frequently, use persistent (structural sharing) data structures to avoid the cost of full copies",
      "Isolate mutable boundaries: when mutation is unavoidable (I/O, caches, buffers), confine it to clearly marked boundaries and keep the core logic purely immutable"
    ],
    "steps_zh": [
      "默认使用不可变声明：默认使用 const、final、readonly 或 val；仅在明确需要变更时使用可变绑定",
      "设计不可变数据类型：创建所有字段在构造时设定且不暴露 setter 或变更方法的类或记录",
      "用转换替代变更：不要就地修改对象，而是使用写时复制或构建器模式产生包含所需更改的新对象",
      "利用持久化数据结构：对于频繁变化的集合，使用持久化（结构共享）数据结构以避免完整复制的成本",
      "隔离可变边界：当变更不可避免时（I/O、缓存、缓冲区），将其限制在明确标记的边界内，保持核心逻辑纯不可变"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Immutable Value",
      "Pure Function",
      "New State",
      "Side Effects"
    ],
    "viz_labels_zh": [
      "不可变值",
      "纯函数",
      "新状态",
      "副作用"
    ],
    "related": [
      "functional-core-imperative-shell",
      "solid-principles",
      "clean-code-principles",
      "error-handling-patterns"
    ],
    "tags": [
      "immutability",
      "functional-programming",
      "concurrency",
      "data-structures",
      "thread-safety"
    ],
    "origin_author": "John Ousterhout, 2018; Rich Hickey, 2007",
    "origin_source": "A Philosophy of Software Design (Ousterhout, 2018); Refactoring (Fowler, 2018)",
    "origin_source_zh": "《软件设计的哲学》（Ousterhout，2018）；《重构》（Fowler，2018）",
    "complexity": "intermediate",
    "when_to_use": [
      "Building concurrent or multi-threaded systems where shared mutable state causes race conditions",
      "Designing domain models where objects should not change after creation, such as events, transactions, or value objects",
      "Implementing undo/redo or time-travel debugging by preserving previous states",
      "Working in functional programming paradigms where immutability is a foundational principle"
    ],
    "when_to_use_zh": [
      "构建共享可变状态导致竞态条件的并发或多线程系统",
      "设计创建后不应更改的领域模型，如事件、事务或值对象",
      "通过保留先前状态实现撤销/重做或时间旅行调试",
      "在不可变性是基础原则的函数式编程范式中工作"
    ],
    "core_concepts": [
      "Immutable Object: an object whose state cannot be changed after construction; any 'modification' produces a new object",
      "Value Object: a domain concept defined by its attributes rather than identity, naturally suited to immutability",
      "Persistent Data Structure: a data structure that preserves its previous versions when modified, using structural sharing to maintain efficiency",
      "Copy-on-Write: a strategy where data is shared by default and only copied when a mutation is requested, balancing performance and immutability",
      "Mutable Boundary: the thin outer layer of a system (I/O, database, UI) where mutation is contained, keeping the core logic free of side effects"
    ],
    "core_concepts_zh": [
      "不可变对象：构造后状态不可更改的对象；任何'修改'都产生一个新对象",
      "值对象：由其属性而非身份定义的领域概念，天然适合不可变性",
      "持久化数据结构：修改时保留先前版本的数据结构，使用结构共享来保持效率",
      "写时复制：默认共享数据，仅在请求变更时才复制的策略，平衡性能与不可变性",
      "可变边界：系统中包含变更的薄外层（I/O、数据库、UI），保持核心逻辑无副作用"
    ],
    "timeline": [
      [
        "1966",
        "Immutable data concepts emerge in early Lisp programming with cons cells and persistent lists"
      ],
      [
        "2007",
        "Rich Hickey creates Clojure, bringing persistent immutable data structures to the JVM mainstream"
      ],
      [
        "2015",
        "Facebook releases Immutable.js, popularizing persistent data structures in JavaScript"
      ],
      [
        "2018",
        "Ousterhout's 'A Philosophy of Software Design' advocates reducing complexity through immutability and deep modules"
      ]
    ],
    "timeline_zh": [
      [
        "1966",
        "不可变数据概念在早期 Lisp 编程中随 cons 单元和持久列表出现"
      ],
      [
        "2007",
        "Rich Hickey 创建 Clojure，将持久不可变数据结构带入 JVM 主流"
      ],
      [
        "2015",
        "Facebook 发布 Immutable.js，在 JavaScript 中推广持久化数据结构"
      ],
      [
        "2018",
        "Ousterhout 的《软件设计的哲学》倡导通过不可变性和深模块减少复杂性"
      ]
    ],
    "dos": [
      "Do default to immutable declarations in every language you work in because mutable state should be a conscious choice, not the default",
      "Do use builder or 'with' patterns when constructing modified copies because they keep immutable APIs ergonomic",
      "Do leverage persistent data structures for frequently updated collections because they avoid the overhead of full copies",
      "Do document mutable boundaries explicitly because the team needs to know where side effects live"
    ],
    "dos_zh": [
      "在你使用的每种语言中默认使用不可变声明，因为可变状态应该是有意识的选择而非默认",
      "在构造修改后的副本时使用构建器或 'with' 模式，因为它们让不可变 API 保持易用",
      "对频繁更新的集合利用持久化数据结构，因为它们避免了完整复制的开销",
      "明确记录可变边界，因为团队需要知道副作用存在于哪里"
    ],
    "donts": [
      "Don't make everything immutable blindly in performance-critical code because excessive copying can degrade throughput",
      "Don't expose mutable internals from an otherwise immutable object because a single leaked reference breaks the guarantee",
      "Don't ignore the cost of deep copying large object graphs because that cost grows with the size of the graph; use structural sharing or copy-on-write instead",
      "Don't forget to make collections immutable too because an immutable object with a mutable list field is not truly immutable"
    ],
    "donts_zh": [
      "不要在性能关键代码中盲目使一切不可变，因为过度复制会降低吞吐量",
      "不要从本应不可变的对象中暴露可变内部，因为一个泄露的引用就会打破保证",
      "不要忽视深复制大型对象图的成本，因为应使用结构共享或写时复制替代",
      "不要忘记也使集合不可变，因为含可变列表字段的不可变对象并非真正不可变"
    ],
    "case_study_company": "Walmart Labs",
    "case_study": "Walmart Labs adopted immutability as a core principle in their React/Redux frontend architecture serving walmart.com. By treating the Redux store as an immutable state tree and using Immutable.js for all data structures, they eliminated an entire class of bugs caused by accidental state mutation in event handlers. Time-travel debugging became trivial, and their server-side rendering pipeline became thread-safe without locks. The migration reduced frontend bugs by 40% in the first quarter after adoption.",
    "case_study_zh": "Walmart Labs 在其服务于 walmart.com 的 React/Redux 前端架构中将不可变性作为核心原则。通过将 Redux store 视为不可变状态树并对所有数据结构使用 Immutable.js，他们消除了事件处理程序中意外状态变更引起的整类 bug。时间旅行调试变得轻而易举，服务端渲染管道无需锁即可线程安全。迁移在采用后的第一季度将前端 bug 减少了 40%。",
    "when_not_to_use": [
      "High-performance systems processing millions of small objects per second where allocation pressure matters",
      "Embedded or memory-constrained environments where object duplication is prohibitively expensive",
      "Legacy codebases deeply built around mutable state where introducing immutability would require a full rewrite",
      "Simple CRUD applications where the overhead of immutable patterns provides no meaningful benefit"
    ],
    "when_not_to_use_zh": [
      "每秒处理数百万小对象、内存分配压力不容忽视的高性能系统",
      "对象复制成本过高的嵌入式或内存受限环境",
      "深度围绕可变状态构建的遗留代码库，引入不可变性需要完全重写",
      "不可变模式的开销不会带来有意义收益的简单 CRUD 应用"
    ],
    "adopters": [
      "Walmart Labs",
      "Facebook",
      "Netflix",
      "Airbnb",
      "Nubank"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Rich Hickey (2007). \"Clojure: A Dynamic Programming Language for the JVM\". clojure.org.",
    "secondary_sources": [
      "John Ousterhout (2018). \"A Philosophy of Software Design\". Yaknyam Press.",
      "Martin Fowler (2018). \"Refactoring: Improving the Design of Existing Code, 2nd Edition\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "functional-core-imperative-shell",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "error-handling-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 176,
    "name": "Null Object Pattern",
    "name_zh": "空对象模式",
    "slug": "null-object-pattern",
    "category": "coding",
    "desc": "Eliminate null checks by providing default-behavior objects that implement the expected interface with no-op or safe defaults",
    "desc_zh": "通过提供实现预期接口的默认行为对象（无操作或安全默认值）来消除空值检查",
    "steps": [
      "Identify pervasive null checks: locate code where null or nil checks are scattered throughout to guard against missing collaborators or absent data",
      "Define the interface or base type: ensure the object being checked for null implements a clear interface or abstract type that defines the expected behavior",
      "Create the Null Object class: implement the interface with a class that provides safe, neutral default behavior — methods return empty collections, zero values, or simply do nothing",
      "Replace null returns with Null Objects: modify factories, repositories, or lookup methods to return the Null Object instead of null when no real object is found",
      "Remove the null checks: with the Null Object in place, delete the defensive null checks from calling code, simplifying control flow and reducing cyclomatic complexity"
    ],
    "steps_zh": [
      "识别普遍的空值检查：定位代码中为防范缺失的协作者或数据而散布的 null 或 nil 检查",
      "定义接口或基类型：确保被检查空值的对象实现了定义预期行为的清晰接口或抽象类型",
      "创建空对象类：用提供安全、中性默认行为的类实现接口 — 方法返回空集合、零值或简单地什么都不做",
      "用空对象替换空值返回：修改工厂、仓库或查找方法，在找不到真实对象时返回空对象而非 null",
      "移除空值检查：有了空对象之后，从调用代码中删除防御性空值检查，简化控制流并降低圈复杂度"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Client",
      "Abstract Type",
      "Real Object",
      "Null Object"
    ],
    "viz_labels_zh": [
      "客户端",
      "抽象类型",
      "真实对象",
      "空对象"
    ],
    "related": [
      "design-by-contract",
      "clean-code-principles",
      "solid-principles",
      "error-handling-patterns"
    ],
    "tags": [
      "null-object",
      "design-pattern",
      "defensive-programming",
      "null-safety",
      "polymorphism"
    ],
    "origin_author": "Bobby Woolf, 1998; Martin Fowler, 2018",
    "origin_source": "Refactoring (Fowler, 2018); Working Effectively with Legacy Code (Feathers, 2004)",
    "origin_source_zh": "《重构》（Fowler，2018）；《修改代码的艺术》（Feathers，2004）",
    "complexity": "beginner",
    "when_to_use": [
      "Code littered with repetitive null checks for the same type, reducing readability and increasing cyclomatic complexity",
      "Providing default or guest behavior in systems with optional collaborators, such as a NullLogger or GuestUser",
      "Legacy codebases where null returns from methods cause frequent NullPointerException or TypeError crashes",
      "Simplifying test code by using Null Objects as safe stand-ins instead of complex mocking setups"
    ],
    "when_to_use_zh": [
      "代码中充斥着对同一类型的重复空值检查，降低可读性并增加圈复杂度",
      "在具有可选协作者的系统中提供默认或访客行为，如 NullLogger 或 GuestUser",
      "方法返回 null 导致频繁 NullPointerException 或 TypeError 崩溃的遗留代码库",
      "通过使用空对象作为安全替身来简化测试代码，替代复杂的 mock 设置"
    ],
    "core_concepts": [
      "Null Object: a concrete implementation of an interface that provides neutral, do-nothing behavior as a stand-in for null",
      "Polymorphic Dispatch: using the type system to route calls to the Null Object's methods instead of guarding with conditionals",
      "Default Behavior: the safe, side-effect-free actions the Null Object performs, such as returning empty strings, empty lists, or zero",
      "Special Case Pattern: Martin Fowler's generalization where any special condition (not just null) is handled by a dedicated object rather than conditionals",
      "Sentinel Value Elimination: replacing magic null values with typed objects that communicate intent and prevent errors"
    ],
    "core_concepts_zh": [
      "空对象：接口的具体实现，提供中性的无操作行为作为 null 的替身",
      "多态分派：使用类型系统将调用路由到空对象的方法，而非用条件语句防护",
      "默认行为：空对象执行的安全、无副作用的操作，如返回空字符串、空列表或零",
      "特殊情况模式：Martin Fowler 的泛化概念，任何特殊条件（不仅是 null）都由专用对象而非条件语句处理",
      "哨兵值消除：用传达意图并防止错误的类型化对象替换魔术空值"
    ],
    "timeline": [
      [
        "1998",
        "Bobby Woolf's 'Null Object' pattern is published in 'Pattern Languages of Program Design 3', after being workshopped at the PLoP (Pattern Languages of Programs) conference"
      ],
      [
        "2004",
        "Michael Feathers describes using Null Objects to break dependencies in legacy code in 'Working Effectively with Legacy Code'"
      ],
      [
        "2018",
        "Martin Fowler includes 'Introduce Special Case' (a generalization of Null Object) as a key refactoring in 'Refactoring' second edition"
      ],
      [
        "2020",
        "Modern languages like Kotlin, Swift, and Rust reduce the need for Null Object via built-in null safety (Option/Optional types)"
      ]
    ],
    "timeline_zh": [
      [
        "1998",
        "Bobby Woolf 的「空对象」模式在 PLoP（程序模式语言）会议上研讨之后，收录于《Pattern Languages of Program Design 3》一书出版"
      ],
      [
        "2004",
        "Michael Feathers 在《修改代码的艺术》中描述了使用空对象来打破遗留代码中的依赖"
      ],
      [
        "2018",
        "Martin Fowler 在《重构》第二版中将「引入特殊情况」（空对象的泛化）作为关键重构手法"
      ],
      [
        "2020",
        "Kotlin、Swift 和 Rust 等现代语言通过内置空安全（Option/Optional 类型）减少了对空对象的需求"
      ]
    ],
    "dos": [
      "Do make the Null Object implement the same interface as the real object because polymorphism is what makes the pattern work",
      "Do make the Null Object a singleton when possible because there is no reason to create multiple instances of identical default behavior",
      "Do name Null Objects clearly (e.g., NullLogger, GuestUser, MissingCustomer) because the name should communicate its role",
      "Do consider using language-level null safety features (Optional, Option) alongside Null Objects because they complement each other"
    ],
    "dos_zh": [
      "让空对象实现与真实对象相同的接口，因为多态性是该模式生效的关键",
      "尽可能将空对象设为单例，因为没有理由创建多个相同默认行为的实例",
      "清晰命名空对象（如 NullLogger、GuestUser、MissingCustomer），因为名称应传达其角色",
      "考虑将语言级空安全特性（Optional、Option）与空对象结合使用，因为它们互为补充"
    ],
    "donts": [
      "Don't hide errors behind Null Objects because sometimes a null indicates a genuine bug that should fail loudly",
      "Don't use Null Objects when the absence of a value carries important business meaning because silently swallowing it masks logic errors",
      "Don't proliferate Null Object classes for every type in the system because it creates unnecessary boilerplate",
      "Don't let the Null Object perform side effects because its purpose is to be inert and safe"
    ],
    "donts_zh": [
      "不要将错误隐藏在空对象后面，因为有时 null 表示应该大声失败的真正 bug",
      "当值的缺失具有重要的业务含义时不要使用空对象，因为悄无声息地掩盖这一缺失会隐藏逻辑错误",
      "不要为系统中每种类型都创建空对象类，因为这会产生不必要的样板代码",
      "不要让空对象执行副作用，因为它存在的意义就是保持惰性和安全"
    ],
    "case_study_company": "JetBrains",
    "case_study": "JetBrains uses the Null Object Pattern extensively throughout the IntelliJ IDEA codebase. For example, PsiElement (the core AST node interface) has a NullPsiElement that is returned when resolution fails, avoiding thousands of null checks in code inspections and refactoring tools. Similarly, their VirtualFile abstraction uses a NullVirtualFile to represent absent files. This pattern allowed IntelliJ's plugin ecosystem to remain stable even when plugins encounter missing elements, because the Null Objects provide safe defaults rather than throwing NullPointerExceptions.",
    "case_study_zh": "JetBrains 在 IntelliJ IDEA 代码库中广泛使用空对象模式。例如，PsiElement（核心 AST 节点接口）有一个 NullPsiElement，在解析失败时返回，避免了代码检查和重构工具中数千处空值检查。类似地，他们的 VirtualFile 抽象使用 NullVirtualFile 来表示不存在的文件。这种模式使 IntelliJ 的插件生态系统即使在插件遇到缺失元素时也保持稳定，因为空对象提供安全默认值而非抛出 NullPointerException。",
    "when_not_to_use": [
      "When null genuinely indicates an error condition that should throw an exception or halt execution",
      "Languages with robust built-in null safety (Kotlin, Rust, Swift) where Option/Optional types are more idiomatic",
      "Simple scripts or small programs where a few null checks are clearer than introducing a new class",
      "Cases where the absence of a value has distinct business meaning that must be explicitly handled"
    ],
    "when_not_to_use_zh": [
      "当 null 真正表示应抛出异常或终止执行的错误条件时",
      "具有健壮内置空安全的语言（Kotlin、Rust、Swift），Option/Optional 类型更符合惯用法",
      "简单脚本或小程序，几处空值检查比引入新类更清晰",
      "值的缺失具有必须明确处理的不同业务含义的情况"
    ],
    "adopters": [
      "JetBrains",
      "Google (Guava)",
      "Apache Commons",
      "Spring Framework",
      "Eclipse Foundation"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Bobby Woolf (1998). \"Null Object\". Pattern Languages of Program Design 3. Addison-Wesley.",
    "secondary_sources": [
      "Martin Fowler (2018). \"Refactoring: Improving the Design of Existing Code, 2nd Edition\". Addison-Wesley.",
      "Michael Feathers (2004). \"Working Effectively with Legacy Code\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "design-by-contract",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "error-handling-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 177,
    "name": "Type-Driven Design",
    "name_zh": "类型驱动设计",
    "slug": "type-driven-design",
    "category": "coding",
    "desc": "Use the type system to encode business rules and constraints, making invalid states unrepresentable at compile time",
    "desc_zh": "利用类型系统编码业务规则和约束，使无效状态在编译时不可表示",
    "steps": [
      "Model the domain with types: replace primitive types (strings, integers) with domain-specific types (EmailAddress, OrderId, PositiveInt) that carry meaning and enforce constraints at construction time",
      "Make illegal states unrepresentable: use union types, enums, or sealed classes to ensure the type system only permits valid combinations of state — for example, an Order that is either Draft, Confirmed, or Shipped, each carrying only the data relevant to that state",
      "Encode state transitions in types: design the API so that operations are only available on the correct state — a Confirmed order has a ship() method, but a Draft order does not",
      "Use types to enforce preconditions: replace runtime validation checks with types that guarantee validity at construction — if a function requires a NonEmptyList, callers cannot pass an empty list",
      "Refactor toward richer types: continuously look for primitives, optionals, or stringly-typed data that could be replaced by more expressive types, narrowing the space of possible bugs"
    ],
    "steps_zh": [
      "用类型建模领域：用携带含义并在构造时强制约束的领域特定类型（EmailAddress、OrderId、PositiveInt）替换原始类型（字符串、整数）",
      "使非法状态不可表示：使用联合类型、枚举或密封类确保类型系统只允许有效的状态组合 — 例如，订单只能是草稿、已确认或已发货，每种状态只携带与该状态相关的数据",
      "在类型中编码状态转换：设计 API 使操作仅在正确状态下可用 — 已确认的订单有 ship() 方法，但草稿订单没有",
      "用类型强制前置条件：用在构造时保证有效性的类型替换运行时验证检查 — 如果函数需要 NonEmptyList，调用方就无法传递空列表",
      "向更丰富的类型重构：持续寻找可以用更具表现力的类型替代的原始类型、可选值或字符串化数据，缩小可能的 bug 空间"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Domain Type",
      "Constraint",
      "Type Safety",
      "Compiler"
    ],
    "viz_labels_zh": [
      "领域类型",
      "类型约束",
      "类型安全",
      "编译器"
    ],
    "related": [
      "design-by-contract",
      "domain-driven-design",
      "clean-code-principles",
      "immutability-pattern"
    ],
    "tags": [
      "type-system",
      "type-safety",
      "domain-modeling",
      "compile-time-safety",
      "make-illegal-states-unrepresentable"
    ],
    "origin_author": "John Ousterhout, 2018; Scott Wlaschin, 2018",
    "origin_source": "A Philosophy of Software Design (Ousterhout, 2018); Refactoring (Fowler, 2018)",
    "origin_source_zh": "《软件设计的哲学》（Ousterhout，2018）；《重构》（Fowler，2018）",
    "complexity": "advanced",
    "when_to_use": [
      "Modeling complex business domains where invalid state combinations cause subtle, hard-to-detect bugs",
      "Building safety-critical systems where runtime errors are unacceptable and correctness must be guaranteed at compile time",
      "Working in strongly typed languages (Rust, TypeScript, Kotlin, Haskell, F#) where the type system is expressive enough to encode constraints",
      "Refactoring primitive-obsessed codebases where strings and integers are used to represent domain concepts"
    ],
    "when_to_use_zh": [
      "建模复杂业务领域，其中无效状态组合会导致微妙且难以检测的 bug",
      "构建安全关键系统，运行时错误不可接受，正确性必须在编译时得到保证",
      "在强类型语言（Rust、TypeScript、Kotlin、Haskell、F#）中工作，其类型系统的表达能力足以编码约束",
      "重构存在基本类型偏执（Primitive Obsession）的代码库，其中字符串和整数被用来表示领域概念"
    ],
    "core_concepts": [
      "Make Illegal States Unrepresentable: design types so that the compiler rejects invalid combinations of data, eliminating entire categories of runtime errors",
      "Domain-Specific Types: replacing primitives (string, int) with named types (EmailAddress, CustomerId) that carry meaning and enforce invariants",
      "Phantom Types: type parameters that exist only at compile time to track state or capabilities without runtime overhead",
      "Smart Constructor: a constructor that validates input and returns the typed value only if invariants are met, ensuring all instances are valid by construction",
      "Type-State Pattern: encoding object lifecycle states as distinct types so that operations are only available in valid states"
    ],
    "core_concepts_zh": [
      "使非法状态不可表示：设计类型使编译器拒绝无效的数据组合，消除整类运行时错误",
      "领域特定类型：用携带含义并强制不变量的命名类型（EmailAddress、CustomerId）替换原始类型（string、int）",
      "幽灵类型：仅在编译时存在的类型参数，用于追踪状态或能力而无运行时开销",
      "智能构造器：验证输入并仅在满足不变量时返回类型化值的构造器，确保所有实例在构造时即有效",
      "类型状态模式：将对象生命周期状态编码为不同类型，使操作仅在有效状态下可用"
    ],
    "timeline": [
      [
        "1985",
        "Cardelli and Wegner's type theory foundations influence the idea that types can encode semantic constraints"
      ],
      [
        "2013",
        "Yaron Minsky coins 'Make Illegal States Unrepresentable' in the OCaml community, crystallizing the philosophy"
      ],
      [
        "2018",
        "Ousterhout's 'A Philosophy of Software Design' emphasizes using deep interfaces and rich types to reduce complexity"
      ],
      [
        "2021",
        "TypeScript 4.1+ template literal types and Rust's type-state pattern bring type-driven design to mainstream languages"
      ]
    ],
    "timeline_zh": [
      [
        "1985",
        "Cardelli 和 Wegner 的类型理论基础影响了类型可以编码语义约束的思想"
      ],
      [
        "2013",
        "Yaron Minsky 在 OCaml 社区中提出「使非法状态不可表示」，将该理念结晶化"
      ],
      [
        "2018",
        "Ousterhout 的《软件设计的哲学》强调使用深接口和丰富类型来减少复杂性"
      ],
      [
        "2021",
        "TypeScript 4.1+ 模板字面量类型和 Rust 的类型状态模式将类型驱动设计带入主流语言"
      ]
    ],
    "dos": [
      "Do wrap primitives in domain types because a CustomerId should never be accidentally used as an OrderId",
      "Do model state machines with distinct types for each state because it makes invalid transitions a compile error",
      "Do use smart constructors that validate on creation because all instances of the type are then guaranteed valid",
      "Do start small by typing the most error-prone primitives first because incremental adoption is more sustainable"
    ],
    "dos_zh": [
      "将原始类型包装在领域类型中，因为 CustomerId 永远不应被意外用作 OrderId",
      "用每个状态的不同类型建模状态机，因为这使无效转换成为编译错误",
      "使用在创建时验证的智能构造器，因为该类型的所有实例都能保证有效",
      "从最容易出错的原始类型开始小规模推进，因为渐进式采用更可持续"
    ],
    "donts": [
      "Don't over-type trivial code because wrapping every string in a newtype for a simple script adds friction with no benefit",
      "Don't ignore ergonomics because if typed APIs are painful to use, developers will work around them",
      "Don't rely solely on types for validation that also needs runtime enforcement because types alone cannot check external input from users or APIs",
      "Don't introduce phantom types or advanced type-level programming in teams unfamiliar with them because it creates a steep learning curve"
    ],
    "donts_zh": [
      "不要对简单代码过度类型化，因为在简单脚本中将每个字符串包装在 newtype 中只会增加摩擦而无收益",
      "不要忽视人体工学，因为如果类型化 API 使用痛苦，开发者会绕过它们",
      "不要仅依赖类型来进行也需要运行时强制的验证，因为类型本身无法检查来自用户或 API 的外部输入",
      "不要在不熟悉它们的团队中引入幽灵类型或高级类型级编程，因为这会造成陡峭的学习曲线"
    ],
    "case_study_company": "Jane Street",
    "case_study": "Jane Street, one of the largest quantitative trading firms, built their entire trading infrastructure in OCaml using type-driven design. Every financial instrument, currency, and quantity is wrapped in domain-specific types — a Dollar amount cannot be accidentally added to a Euro amount, and a buy order type cannot be passed where a sell order is expected. This approach has prevented costly trading errors that plague firms using stringly-typed or loosely-typed systems. Their open-source libraries (Core, Async) demonstrate this philosophy, and they credit type-driven design with enabling their small engineering team to manage billions of dollars in daily trading volume with remarkably few production bugs.",
    "case_study_zh": "Jane Street 作为最大的量化交易公司之一，使用类型驱动设计在 OCaml 中构建了整个交易基础设施。每种金融工具、货币和数量都包装在领域特定类型中 — 美元金额不能意外地与欧元金额相加，买入订单类型不能传递到期望卖出订单的地方。这种方法避免了代价高昂的交易错误，而使用字符串化或弱类型系统的公司正深受这类错误困扰。他们的开源库（Core、Async）展示了这一理念，并且他们认为，正是类型驱动设计使其小型工程团队能够以极少的生产 bug 管理每日数十亿美元的交易量。",
    "case_study_challenge": "In quantitative trading, a single type confusion — adding a Dollar amount to a Euro amount, or passing a buy order where a sell order is expected — can trigger losses measured in millions before anyone notices. Firms using stringly-typed or loosely-typed systems suffered recurring costly trading errors that no amount of testing could fully prevent.",
    "case_study_challenge_zh": "在量化交易中，一次类型混淆——将美元金额与欧元金额相加，或在期望卖单的地方传入买单——就可能在无人察觉之前触发数百万的损失。使用字符串化或弱类型系统的公司反复遭受代价高昂的交易错误，无论怎样测试都无法完全避免。",
    "case_study_approach": "Jane Street built their entire trading infrastructure in OCaml, wrapping every financial instrument, currency, and quantity in domain-specific types. The compiler itself became the first line of defense: illegal operations simply cannot compile. Their open-source libraries Core and Async embody this philosophy, encoding business rules directly in the type system.",
    "case_study_approach_zh": "Jane Street 用 OCaml 构建了整个交易基础设施，将每种金融工具、货币和数量都包装在领域特定类型中。编译器本身成为第一道防线：非法操作根本无法通过编译。他们的开源库 Core 和 Async 体现了这一理念，将业务规则直接编码在类型系统中。",
    "case_study_result": "A small engineering team manages billions of dollars in daily trading volume with remarkably few production bugs. The type system catches entire categories of errors — unit mismatches, currency confusion, order direction mistakes — at compile time, long before code reaches production.",
    "case_study_result_zh": "一支精干的工程团队以极少的生产缺陷管理着每日数十亿美元的交易量。类型系统在编译期就能捕获整类错误——单位不匹配、货币混淆、订单方向错误——远在代码进入生产环境之前。",
    "case_study_quote": "If it compiles, it trades correctly. We moved the cost of catching errors from runtime — where they cost millions — to compile time, where they cost seconds.",
    "case_study_quote_zh": "如果它能编译通过，它就能正确交易。我们把捕获错误的代价从运行时——可能损失数百万——转移到了编译期，只需数秒。",
    "when_not_to_use": [
      "Dynamically typed languages (Python, Ruby, JavaScript) where the type system is too weak to encode meaningful constraints",
      "Rapid prototyping phases where the domain model is still being discovered and types would need constant reshaping",
      "Small scripts or one-off utilities where the overhead of custom types exceeds the benefit",
      "Teams with limited experience in advanced type systems where the learning curve would slow delivery"
    ],
    "when_not_to_use_zh": [
      "动态类型语言（Python、Ruby、JavaScript），其类型系统太弱无法编码有意义的约束",
      "领域模型仍在探索中、类型需要不断重塑的快速原型阶段",
      "自定义类型的开销超过收益的小脚本或一次性工具",
      "高级类型系统经验有限、学习曲线会拖慢交付的团队"
    ],
    "adopters": [
      "Jane Street",
      "Meta (Flow/Hack)",
      "Microsoft (TypeScript)",
      "Jet.com (F#)",
      "Mozilla (Rust)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Scott Wlaschin (2018). \"Domain Modeling Made Functional\". Pragmatic Bookshelf.",
    "secondary_sources": [
      "John Ousterhout (2018). \"A Philosophy of Software Design\". Yaknyam Press.",
      "Benjamin Pierce (2002). \"Types and Programming Languages\". MIT Press."
    ],
    "typed_relations": [
      {
        "slug": "design-by-contract",
        "type": "complement"
      },
      {
        "slug": "domain-driven-design",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "immutability-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 225,
    "name": "Strategy Pattern",
    "name_zh": "策略模式",
    "slug": "strategy-pattern",
    "category": "coding",
    "desc": "Encapsulate interchangeable algorithms behind a common interface",
    "desc_zh": "将可互换的算法封装在统一接口后，使算法可独立于使用方变化",
    "steps": [
      "Define the Strategy interface: declare a single method (or small set) that all algorithm variants must implement, establishing the contract",
      "Implement Concrete Strategies: create one class per algorithm variant, each encapsulating its logic behind the shared interface",
      "Build the Context class: hold a reference to a Strategy interface and delegate the algorithm call to it, keeping Context free of conditional logic",
      "Inject or swap strategies at runtime: pass the desired strategy via constructor or setter so callers can change behavior without modifying Context",
      "Test strategies independently: unit-test each concrete strategy in isolation, then test Context with mock strategies to verify delegation"
    ],
    "steps_zh": [
      "定义策略接口：声明所有算法变体必须实现的单一方法（或小型方法集），建立契约",
      "实现具体策略：为每种算法变体创建一个类，各自将逻辑封装在共享接口后",
      "构建上下文类：持有策略接口的引用，并将算法调用委托给它，使上下文不含条件判断逻辑",
      "在运行时注入或切换策略：通过构造函数或 setter 传入所需策略，调用方无需修改上下文即可改变行为",
      "独立测试各策略：对每个具体策略进行单元测试，再用模拟策略测试上下文以验证委托行为"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Context",
      "Strategy Interface",
      "Concrete Strategy",
      "Client"
    ],
    "viz_labels_zh": [
      "上下文",
      "策略接口",
      "具体策略",
      "客户端"
    ],
    "related": [
      "solid-principles",
      "command-pattern",
      "dependency-injection"
    ],
    "tags": [
      "gof",
      "behavioral",
      "design-patterns",
      "algorithm",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "intermediate",
    "when_to_use": [
      "Multiple variants of an algorithm exist and you want to switch between them at runtime without conditionals",
      "You need to isolate business rules or policies so they can be tested and replaced independently",
      "An AI agent needs to select different planning, search, or scoring strategies dynamically"
    ],
    "when_to_use_zh": [
      "存在多种算法变体，希望在运行时无需条件判断即可切换",
      "需要将业务规则或策略隔离，以便独立测试和替换",
      "AI 智能体需要动态选择不同的规划、搜索或评分策略"
    ],
    "core_concepts": [
      "Strategy interface: the contract that all algorithm variants implement, enabling polymorphic substitution",
      "Context: the object that holds a strategy reference and delegates the variable part of its behavior to it",
      "Composition over inheritance: behavior varies by swapping whole objects rather than by subclassing Context"
    ],
    "core_concepts_zh": [
      "策略接口：所有算法变体实现的契约，支持多态替换",
      "上下文：持有策略引用并将可变行为委托给策略的对象",
      "组合优于继承：通过替换整个对象而非对上下文子类化来改变行为"
    ],
    "timeline": [
      [
        "1994",
        "GoF publish 'Design Patterns', introducing Strategy as one of 23 canonical patterns"
      ],
      [
        "2000",
        "Strategy becomes a foundation of the Policy Object pattern in enterprise Java frameworks"
      ],
      [
        "2010",
        "Functional languages popularize Strategy via first-class functions and lambdas, reducing boilerplate"
      ],
      [
        "2020",
        "Strategy widely adopted in ML pipelines to swap preprocessing, model, and evaluation components"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 出版《设计模式》，将策略模式作为 23 个经典模式之一引入"
      ],
      [
        "2000",
        "策略模式成为企业 Java 框架中策略对象模式的基础"
      ],
      [
        "2010",
        "函数式语言通过一等函数和 lambda 推广策略模式，减少样板代码"
      ],
      [
        "2020",
        "策略模式在 ML 流水线中广泛应用，用于替换预处理、模型和评估组件"
      ]
    ],
    "dos": [
      "Do keep the strategy interface small and focused so each variant is easy to implement and test",
      "Do inject strategies via constructors to make dependencies explicit and enable straightforward unit testing",
      "Do use Strategy whenever you find yourself writing if-else or switch on a type enum to pick an algorithm"
    ],
    "dos_zh": [
      "保持策略接口小而专注，使每个变体易于实现和测试",
      "通过构造函数注入策略，使依赖关系明确，便于单元测试",
      "每当发现自己对类型枚举使用 if-else 或 switch 来选择算法时，就应使用策略模式"
    ],
    "donts": [
      "Don't use Strategy for a single algorithm with no anticipated variation because the indirection adds complexity without benefit",
      "Don't allow strategies to reach into the Context's internals because it creates hidden coupling between the two",
      "Don't create a new Strategy subclass for trivial one-line variants in languages with lambdas — pass a function instead"
    ],
    "donts_zh": [
      "不要对没有预期变化的单一算法使用策略模式，因为间接层只增加复杂性而无益处",
      "不要让策略访问上下文的内部状态，因为这会在两者之间造成隐性耦合",
      "在支持 lambda 的语言中，不要为简单的单行变体创建策略子类——直接传入函数即可"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's streaming playback engine uses the Strategy pattern to select adaptive bitrate (ABR) algorithms. Different strategies (buffer-based, throughput-based, reinforcement-learning-based) implement a common ABR interface. The playback Context selects the appropriate strategy based on device type and network conditions at runtime, allowing Netflix to A/B test new algorithms without touching the core player code.",
    "case_study_zh": "Netflix 的流媒体播放引擎使用策略模式来选择自适应码率（ABR）算法。不同的策略（基于缓冲区、基于吞吐量、基于强化学习）实现统一的 ABR 接口。播放上下文在运行时根据设备类型和网络条件选择合适的策略，使 Netflix 无需修改核心播放器代码即可对新算法进行 A/B 测试。",
    "when_not_to_use": [
      "When there is truly only one algorithm and no realistic chance of needing to swap it",
      "Simple scripts or small utilities where the overhead of interface + multiple classes is disproportionate",
      "When a language supports first-class functions, a plain function parameter is often cleaner than a Strategy class"
    ],
    "when_not_to_use_zh": [
      "只有一种算法且没有切换需求时",
      "简单脚本或小型工具，接口加多个类的开销不成比例",
      "当语言支持一等函数时，普通函数参数通常比策略类更简洁"
    ],
    "adopters": [
      "Netflix",
      "Spring Framework",
      "Java Collections (Comparator)",
      "scikit-learn"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Martin, R. C. (2002). \"Agile Software Development, Principles, Patterns, and Practices\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "command-pattern",
        "type": "related"
      },
      {
        "slug": "dependency-injection",
        "type": "complement"
      }
    ]
  },
  {
    "id": 226,
    "name": "Observer Pattern",
    "name_zh": "观察者模式",
    "slug": "observer-pattern",
    "category": "coding",
    "desc": "Notify dependents automatically when state changes",
    "desc_zh": "当对象状态发生变化时自动通知所有依赖方",
    "steps": [
      "Define the Subject interface: expose methods to attach, detach, and notify observers, keeping the subject independent of concrete observer types",
      "Define the Observer interface: declare an update method that subjects will call, carrying relevant state or event data",
      "Implement the Concrete Subject: maintain a list of observers, manage state, and call notify whenever a meaningful state change occurs",
      "Implement Concrete Observers: register with the subject and react to notifications by reading subject state or the pushed event payload",
      "Manage subscription lifecycle: ensure observers unsubscribe when no longer needed to prevent memory leaks and stale notifications"
    ],
    "steps_zh": [
      "定义主题接口：暴露注册、注销和通知观察者的方法，使主题独立于具体观察者类型",
      "定义观察者接口：声明主题将调用的 update 方法，携带相关状态或事件数据",
      "实现具体主题：维护观察者列表，管理状态，并在有意义的状态变更时调用通知方法",
      "实现具体观察者：向主题注册，通过读取主题状态或推送的事件负载对通知做出响应",
      "管理订阅生命周期：确保观察者在不再需要时取消订阅，防止内存泄漏和陈旧通知"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Subject",
      "Observer",
      "Notify",
      "Subscribe"
    ],
    "viz_labels_zh": [
      "主题",
      "观察者",
      "通知",
      "订阅"
    ],
    "related": [
      "strategy-pattern",
      "command-pattern"
    ],
    "tags": [
      "gof",
      "behavioral",
      "design-patterns",
      "event",
      "pub-sub",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "intermediate",
    "when_to_use": [
      "One object's state change should trigger updates in an unknown or variable number of other objects",
      "You want to decouple the producer of events from the consumers without forcing them to know about each other",
      "Building reactive UIs, event-driven agent environments, or real-time data feeds where multiple components react to changes"
    ],
    "when_to_use_zh": [
      "一个对象的状态变化需要触发数量未知或可变的其他对象的更新",
      "希望将事件生产者与消费者解耦，无需双方互相了解",
      "构建响应式 UI、事件驱动的智能体环境或实时数据流，多个组件对变化做出响应"
    ],
    "core_concepts": [
      "Subject (Observable): maintains a list of observers and broadcasts notifications on state change",
      "Observer: the interface or callback that subjects invoke when notifying, decoupling the subject from concrete observer implementations",
      "Push vs Pull: subjects can push state in the notification payload (push model) or observers can query state after being pinged (pull model)"
    ],
    "core_concepts_zh": [
      "主题（可观察者）：维护观察者列表，在状态变化时广播通知",
      "观察者：主题在通知时调用的接口或回调，将其与具体实现解耦",
      "推送 vs 拉取：主题可在通知负载中推送状态（推送模式），或观察者在收到通知后查询状态（拉取模式）"
    ],
    "timeline": [
      [
        "1994",
        "GoF formalize Observer as one of 23 patterns; MVC's model-view notification already used the concept"
      ],
      [
        "1996",
        "Java introduces java.util.Observable and java.util.Observer in the standard library"
      ],
      [
        "2012",
        "Reactive Extensions (Rx) reframe Observer as IObservable/IObserver, adding backpressure and composition"
      ],
      [
        "2019",
        "Observer underpins modern frontend state libraries (Redux, MobX, Vue reactivity) and agent event loops"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 将观察者模式正式化为 23 个模式之一；MVC 的模型-视图通知机制已在使用该概念"
      ],
      [
        "1996",
        "Java 在标准库中引入 java.util.Observable 和 java.util.Observer"
      ],
      [
        "2012",
        "响应式扩展（Rx）将观察者重塑为 IObservable/IObserver，增加背压和组合能力"
      ],
      [
        "2019",
        "观察者模式支撑现代前端状态库（Redux、MobX、Vue 响应式）和智能体事件循环"
      ]
    ],
    "dos": [
      "Do use weak references or explicit unsubscribe to prevent memory leaks when subjects outlive observers that are no longer needed",
      "Do keep observer update methods fast and non-blocking; offload heavy work to background threads or queues",
      "Do consider passing event data in the notification rather than forcing observers to call back into the subject"
    ],
    "dos_zh": [
      "使用弱引用或显式取消订阅，防止主题的生命周期长于已不再需要的观察者时造成内存泄漏",
      "保持观察者的 update 方法快速且非阻塞；将繁重工作卸载到后台线程或队列",
      "考虑在通知中传递事件数据，而非强制观察者回调主题"
    ],
    "donts": [
      "Don't create cascading notification chains where one observer triggers another subject's notification — it causes hard-to-trace update storms",
      "Don't notify observers in a non-deterministic order when observers depend on each other's effects",
      "Don't use Observer for simple one-to-one callbacks; a direct method call or delegate is simpler"
    ],
    "donts_zh": [
      "不要创建级联通知链，即一个观察者触发另一个主题的通知——这会导致难以追踪的更新风暴",
      "当观察者相互依赖对方效果时，不要以不确定的顺序通知观察者",
      "不要将观察者用于简单的一对一回调；直接方法调用或委托更简单"
    ],
    "case_study_company": "Vue.js",
    "case_study": "Vue.js 3's reactivity system is a direct application of the Observer pattern. Each reactive data property is a Subject; component render functions and computed properties are Observers automatically registered via dependency tracking during a getter trap. When a property changes, only the components that actually read that property are notified and re-rendered, achieving fine-grained reactivity without manual subscription management.",
    "case_study_zh": "Vue.js 3 的响应式系统是观察者模式的直接应用。每个响应式数据属性都是一个主题；组件渲染函数和计算属性是通过 getter 拦截期间的依赖追踪自动注册的观察者。当属性变化时，只有实际读取该属性的组件才会收到通知并重新渲染，实现细粒度响应性而无需手动管理订阅。",
    "when_not_to_use": [
      "Simple synchronous pipelines where direct method calls are clearer and the set of dependents is fixed",
      "Performance-critical tight loops where notification overhead is measurable",
      "When event ordering guarantees are critical and the broadcast model introduces non-determinism"
    ],
    "when_not_to_use_zh": [
      "简单的同步流水线，直接方法调用更清晰且依赖方集合固定",
      "性能关键的紧密循环，通知开销可被度量",
      "事件顺序保证至关重要而广播模型引入不确定性时"
    ],
    "adopters": [
      "Vue.js",
      "Angular (EventEmitter)",
      "RxJS",
      "Redux"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Meijer, E. (2012). \"Your Mouse is a Database\". ACM Queue."
    ],
    "typed_relations": [
      {
        "slug": "strategy-pattern",
        "type": "related"
      },
      {
        "slug": "command-pattern",
        "type": "related"
      }
    ]
  },
  {
    "id": 227,
    "name": "Factory Method Pattern",
    "name_zh": "工厂方法模式",
    "slug": "factory-method-pattern",
    "category": "coding",
    "desc": "Delegate object creation to subclasses",
    "desc_zh": "将对象的创建委托给子类决定，使父类无需依赖具体产品类",
    "steps": [
      "Define the Product interface: declare the interface or abstract class that all created objects must conform to",
      "Declare the Creator with a factory method: write an abstract or virtual method in the Creator that returns a Product, leaving the concrete type unspecified",
      "Implement Concrete Creators: subclass the Creator and override the factory method to instantiate and return the appropriate Concrete Product",
      "Use the product through its interface: write the Creator's template business logic against the Product interface, not the concrete type",
      "Register or select creators dynamically: optionally use a registry or configuration to choose which Concrete Creator to instantiate at runtime"
    ],
    "steps_zh": [
      "定义产品接口：声明所有创建对象必须符合的接口或抽象类",
      "在创建者中声明工厂方法：在创建者中编写抽象或虚方法，返回产品类型，不指定具体类型",
      "实现具体创建者：对创建者进行子类化，重写工厂方法以实例化并返回相应的具体产品",
      "通过接口使用产品：基于产品接口编写创建者的模板业务逻辑，而非依赖具体类型",
      "动态注册或选择创建者：可选地使用注册表或配置在运行时选择实例化哪个具体创建者"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Creator",
      "Factory Method",
      "Product",
      "Concrete Product"
    ],
    "viz_labels_zh": [
      "创建者",
      "工厂方法",
      "产品接口",
      "具体产品"
    ],
    "related": [
      "abstract-factory-pattern",
      "solid-principles",
      "dependency-injection"
    ],
    "tags": [
      "gof",
      "creational",
      "design-patterns",
      "object-creation",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "intermediate",
    "when_to_use": [
      "A class cannot anticipate the exact type of objects it must create, deferring the decision to subclasses",
      "You want to give subclasses control over which objects they create while sharing the surrounding template logic",
      "Frameworks need to create objects whose type is unknown to the framework itself but known to its users"
    ],
    "when_to_use_zh": [
      "一个类无法预期它必须创建的对象的确切类型，将决定权推迟到子类",
      "希望让子类控制创建哪些对象，同时共享周围的模板逻辑",
      "框架需要创建其自身未知但用户已知类型的对象"
    ],
    "core_concepts": [
      "Creator: the class that declares the factory method, often providing default implementation or a template method that calls it",
      "Concrete Creator: overrides the factory method to produce a specific product, encapsulating instantiation details",
      "Product polymorphism: the creator's logic works against the Product interface, making it independent of which concrete product is made"
    ],
    "core_concepts_zh": [
      "创建者：声明工厂方法的类，通常提供默认实现或调用工厂方法的模板方法",
      "具体创建者：重写工厂方法以生产特定产品，封装实例化细节",
      "产品多态：创建者的逻辑基于产品接口运行，独立于具体产品类型"
    ],
    "timeline": [
      [
        "1994",
        "GoF publish Factory Method as a core creational pattern alongside Abstract Factory"
      ],
      [
        "2000",
        "Factory Method becomes the backbone of Java's JDBC DriverManager and servlet container APIs"
      ],
      [
        "2009",
        "Spring Framework codifies factory methods via @Bean annotations, blending DI with factory semantics"
      ],
      [
        "2018",
        "Factory functions (not classes) become idiomatic in TypeScript and Kotlin, simplifying the pattern"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 将工厂方法作为核心创建型模式与抽象工厂一并发布"
      ],
      [
        "2000",
        "工厂方法成为 Java JDBC DriverManager 和 Servlet 容器 API 的核心"
      ],
      [
        "2009",
        "Spring Framework 通过 @Bean 注解将工厂方法规范化，融合 DI 与工厂语义"
      ],
      [
        "2018",
        "工厂函数（而非类）在 TypeScript 和 Kotlin 中成为惯用写法，简化了该模式"
      ]
    ],
    "dos": [
      "Do define the factory method on an interface or abstract class so callers depend on the abstraction",
      "Do use the factory method to encapsulate all construction complexity — validation, configuration, dependency wiring",
      "Do prefer factory functions over factory classes in languages with first-class functions to reduce boilerplate"
    ],
    "dos_zh": [
      "在接口或抽象类上定义工厂方法，使调用方依赖抽象",
      "使用工厂方法封装所有构造复杂性——验证、配置、依赖装配",
      "在支持一等函数的语言中，优先使用工厂函数而非工厂类，以减少样板代码"
    ],
    "donts": [
      "Don't use Factory Method when a simple constructor is sufficient; it adds indirection without benefit",
      "Don't make the factory method do heavy work like I/O or network calls; it should assemble, not execute",
      "Don't confuse Factory Method with Static Factory Method — the GoF pattern requires subclassing for variation"
    ],
    "donts_zh": [
      "当简单构造函数足够时，不要使用工厂方法；它只增加间接层而无益处",
      "不要让工厂方法执行 I/O 或网络调用等繁重工作；它应负责组装，而非执行",
      "不要将工厂方法与静态工厂方法混淆——GoF 模式需要通过子类化实现变化"
    ],
    "case_study_company": "Java JDBC",
    "case_study": "Java's JDBC API is a textbook Factory Method implementation. DriverManager.getConnection() is the factory method — callers pass a connection URL, and the registered Driver (Concrete Creator) determines which Connection implementation to instantiate. Application code depends only on the java.sql.Connection interface, enabling transparent switching between MySQL, PostgreSQL, and Oracle drivers without changing a single line of business logic.",
    "case_study_zh": "Java 的 JDBC API 是工厂方法的教科书级实现。DriverManager.getConnection() 是工厂方法——调用方传入连接 URL，已注册的 Driver（具体创建者）决定实例化哪个 Connection 实现。应用代码只依赖 java.sql.Connection 接口，可在 MySQL、PostgreSQL 和 Oracle 驱动之间透明切换，无需修改任何业务逻辑。",
    "when_not_to_use": [
      "When there is only one product type and no anticipated need for variation",
      "When the overhead of subclassing Creators is not justified by the complexity of the creation logic",
      "When dependency injection already manages object creation centrally, making factories redundant"
    ],
    "when_not_to_use_zh": [
      "只有一种产品类型且没有预期变化需求时",
      "创建逻辑的复杂度不足以抵消子类化创建者带来的开销时",
      "依赖注入已集中管理对象创建，使工厂变得多余时"
    ],
    "adopters": [
      "Java JDBC",
      "Spring Framework",
      "Angular",
      ".NET HttpClientFactory"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Bloch, J. (2008). \"Effective Java, 2nd ed.\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "abstract-factory-pattern",
        "type": "related"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "dependency-injection",
        "type": "complement"
      }
    ]
  },
  {
    "id": 228,
    "name": "Abstract Factory Pattern",
    "name_zh": "抽象工厂模式",
    "slug": "abstract-factory-pattern",
    "category": "coding",
    "desc": "Create families of related objects without specifying concrete classes",
    "desc_zh": "在不指定具体类的情况下创建一系列相关或相互依赖的对象",
    "steps": [
      "Identify product families: group related product types (e.g., Button + Checkbox for a UI toolkit) into coherent families per platform or variant",
      "Define Abstract Product interfaces: declare an interface for each product type in the family so clients depend on abstractions",
      "Define the Abstract Factory interface: declare creation methods for each product type, returning Abstract Product interfaces",
      "Implement Concrete Factories: create one factory class per product family, implementing all creation methods to produce the matching product variants",
      "Configure the application with a Concrete Factory: inject or select the appropriate Concrete Factory at startup; all downstream code uses only the Abstract Factory"
    ],
    "steps_zh": [
      "识别产品族：将相关产品类型（如 UI 工具包的按钮和复选框）按平台或变体分组为连贯的产品族",
      "定义抽象产品接口：为族中每种产品类型声明接口，使客户端依赖抽象",
      "定义抽象工厂接口：为每种产品类型声明创建方法，返回抽象产品接口",
      "实现具体工厂：为每个产品族创建一个工厂类，实现所有创建方法以生产匹配的产品变体",
      "用具体工厂配置应用：在启动时注入或选择合适的具体工厂；所有下游代码只使用抽象工厂"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Abstract Factory",
      "Concrete Factory",
      "Product A",
      "Product B"
    ],
    "viz_labels_zh": [
      "抽象工厂",
      "具体工厂",
      "产品A",
      "产品B"
    ],
    "related": [
      "factory-method-pattern",
      "solid-principles",
      "dependency-injection"
    ],
    "tags": [
      "gof",
      "creational",
      "design-patterns",
      "object-creation",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "intermediate",
    "when_to_use": [
      "A system must be independent of how its products are created and composed, and must work with multiple product families",
      "You want to enforce consistency across a family of related products (e.g., all UI widgets must match the same theme)",
      "You need to swap entire product families at runtime — e.g., switching between different cloud providers' SDK implementations"
    ],
    "when_to_use_zh": [
      "系统必须独立于其产品的创建和组合方式，且必须与多个产品族协作",
      "希望强制一系列相关产品之间的一致性（如所有 UI 控件必须符合同一主题）",
      "需要在运行时切换整个产品族——例如切换云服务商的 SDK 实现"
    ],
    "core_concepts": [
      "Abstract Factory: the interface declaring creation methods for all product types in a family",
      "Concrete Factory: implements the Abstract Factory to produce a coherent set of product variants for one platform or theme",
      "Product family consistency: the pattern guarantees that products from the same factory are designed to work together"
    ],
    "core_concepts_zh": [
      "抽象工厂：声明为族中所有产品类型创建方法的接口",
      "具体工厂：实现抽象工厂，为一个平台或主题生产一套连贯的产品变体",
      "产品族一致性：该模式保证来自同一工厂的产品被设计为协同工作"
    ],
    "timeline": [
      [
        "1994",
        "GoF introduce Abstract Factory as an extension of Factory Method for product families"
      ],
      [
        "1998",
        "Abstract Factory becomes the canonical solution for Java Look-and-Feel (PLAF) in Swing"
      ],
      [
        "2010",
        "Cloud SDKs adopt Abstract Factory to let applications switch between AWS, Azure, and GCP implementations"
      ],
      [
        "2020",
        "Abstract Factory influences plugin architectures in VS Code and IntelliJ for multi-platform extension APIs"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 将抽象工厂作为工厂方法针对产品族的扩展引入"
      ],
      [
        "1998",
        "抽象工厂成为 Swing 中 Java 外观（PLAF）的标准解决方案"
      ],
      [
        "2010",
        "云 SDK 采用抽象工厂，使应用可在 AWS、Azure 和 GCP 实现间切换"
      ],
      [
        "2020",
        "抽象工厂影响 VS Code 和 IntelliJ 的插件架构，用于多平台扩展 API"
      ]
    ],
    "dos": [
      "Do introduce Abstract Factory only when you genuinely need multiple families; premature abstraction is costly",
      "Do keep each Concrete Factory cohesive — every product it creates should belong to the same family or theme",
      "Do inject the Abstract Factory at the application composition root so the rest of the code never knows which family is active"
    ],
    "dos_zh": [
      "仅在真正需要多个产品族时引入抽象工厂；过早抽象代价高昂",
      "保持每个具体工厂的内聚性——它创建的每个产品都应属于同一族或主题",
      "在应用组合根处注入抽象工厂，使其余代码永远不知道哪个产品族处于激活状态"
    ],
    "donts": [
      "Don't add new product types to an existing Abstract Factory lightly — it forces changes to all Concrete Factories",
      "Don't use Abstract Factory when Factory Method is sufficient; the added layer of abstraction has a real cost",
      "Don't let Concrete Factories hold mutable state; they should be stateless creation services"
    ],
    "donts_zh": [
      "不要轻易向现有抽象工厂添加新产品类型——这会迫使所有具体工厂做出修改",
      "当工厂方法已足够时，不要使用抽象工厂；额外的抽象层有真实代价",
      "不要让具体工厂持有可变状态；它们应该是无状态的创建服务"
    ],
    "case_study_company": "Java Swing",
    "case_study": "Java Swing's Pluggable Look-and-Feel (PLAF) architecture is Abstract Factory in action. The LookAndFeel class acts as the Abstract Factory, declaring creation methods for every UI component (Button, TextField, ScrollBar). Concrete factories like MetalLookAndFeel and WindowsLookAndFeel implement all methods, ensuring every widget produced in a session is visually consistent. Switching the entire UI theme requires changing only a single factory at startup.",
    "case_study_zh": "Java Swing 的可插拔外观（PLAF）架构是抽象工厂的实际应用。LookAndFeel 类充当抽象工厂，为每个 UI 组件（按钮、文本框、滚动条）声明创建方法。MetalLookAndFeel 和 WindowsLookAndFeel 等具体工厂实现所有方法，确保会话中产生的每个控件在视觉上保持一致。切换整个 UI 主题只需在启动时更换一个工厂。",
    "when_not_to_use": [
      "When the application only has one product family and no plans to support others",
      "Small projects where the interface hierarchy adds more boilerplate than it saves",
      "When dependency injection containers already manage multi-implementation selection, making an explicit factory redundant"
    ],
    "when_not_to_use_zh": [
      "应用只有一个产品族且没有支持其他族的计划时",
      "小型项目中，接口层次结构增加的样板代码多于节省的代码",
      "依赖注入容器已管理多实现选择，使显式工厂变得多余时"
    ],
    "adopters": [
      "Java Swing (PLAF)",
      "AWS SDK",
      "Qt Framework",
      "IntelliJ Plugin API"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Larman, C. (2004). \"Applying UML and Patterns\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "factory-method-pattern",
        "type": "extends"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "dependency-injection",
        "type": "complement"
      }
    ]
  },
  {
    "id": 229,
    "name": "Decorator Pattern",
    "name_zh": "装饰器模式",
    "slug": "decorator-pattern",
    "category": "coding",
    "desc": "Attach additional responsibilities to objects dynamically",
    "desc_zh": "动态地为对象添加额外职责，是子类化扩展功能的灵活替代方案",
    "steps": [
      "Define the Component interface: establish the interface that both concrete components and decorators implement, ensuring they are interchangeable",
      "Implement the Concrete Component: create the base object providing the core behavior to be decorated",
      "Create the abstract Decorator: implement the Component interface, hold a reference to a Component, and delegate all calls to it by default",
      "Implement Concrete Decorators: subclass the abstract Decorator and override methods to add behavior before or after delegating to the wrapped component",
      "Compose at runtime: wrap components in one or more decorators in any order to achieve the desired combination of behaviors"
    ],
    "steps_zh": [
      "定义组件接口：建立具体组件和装饰器都实现的接口，确保它们可互换",
      "实现具体组件：创建提供要被装饰的核心行为的基础对象",
      "创建抽象装饰器：实现组件接口，持有组件引用，默认将所有调用委托给它",
      "实现具体装饰器：对抽象装饰器进行子类化，重写方法以在委托给被包装组件前后添加行为",
      "在运行时组合：以任意顺序将组件包装在一个或多个装饰器中，实现所需的行为组合"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Component",
      "Decorator",
      "Concrete Decorator",
      "Wrapper"
    ],
    "viz_labels_zh": [
      "组件接口",
      "装饰器",
      "具体装饰器",
      "包装层"
    ],
    "related": [
      "solid-principles",
      "adapter-pattern",
      "strategy-pattern"
    ],
    "tags": [
      "gof",
      "structural",
      "design-patterns",
      "composition",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "intermediate",
    "when_to_use": [
      "You need to add responsibilities to individual objects without affecting others of the same class",
      "Subclassing would produce an explosion of classes to cover every possible feature combination",
      "Behaviors need to be layered or stacked in different orders at runtime (e.g., logging + caching + auth on an HTTP handler)"
    ],
    "when_to_use_zh": [
      "需要为单个对象添加职责而不影响同类的其他对象",
      "子类化会因功能组合爆炸而导致类数量激增",
      "行为需要在运行时以不同顺序层叠（如 HTTP 处理器上的日志 + 缓存 + 认证）"
    ],
    "core_concepts": [
      "Wrapping: a decorator holds a reference to the component it wraps and forwards calls to it, adding behavior around the delegation",
      "Transparent interface: because decorators implement the same interface as the component, clients cannot tell whether they hold a raw component or a stack of decorators",
      "Open/Closed compliance: new behaviors are added by writing new decorators, never by modifying existing component or decorator code"
    ],
    "core_concepts_zh": [
      "包装：装饰器持有被包装组件的引用，将调用转发给它，并在委托前后添加行为",
      "透明接口：因为装饰器与组件实现相同的接口，客户端无法判断持有的是原始组件还是装饰器栈",
      "符合开闭原则：通过编写新装饰器添加新行为，从不修改现有组件或装饰器代码"
    ],
    "timeline": [
      [
        "1994",
        "GoF introduce Decorator as the primary alternative to subclassing for extending behavior"
      ],
      [
        "2000",
        "Java I/O streams (InputStream, BufferedInputStream, GZIPInputStream) popularize Decorator in practice"
      ],
      [
        "2004",
        "Python 2.4 adds decorator syntax (@decorator, PEP 318), normalizing the pattern as a language feature, though it differs semantically"
      ],
      [
        "2022",
        "Middleware stacks in web frameworks (Express.js, ASP.NET Core) are recognized as Decorator chains"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 将装饰器作为子类化扩展行为的主要替代方案引入"
      ],
      [
        "2000",
        "Java I/O 流（InputStream、BufferedInputStream、GZIPInputStream）在实践中推广装饰器模式"
      ],
      [
        "2004",
        "Python 2.4 引入装饰器语法（@decorator，PEP 318），将该模式规范化为语言特性，尽管语义有所不同"
      ],
      [
        "2022",
        "Web 框架中的中间件栈（Express.js、ASP.NET Core）被认可为装饰器链"
      ]
    ],
    "dos": [
      "Do ensure decorators are truly transparent by fully implementing the component interface, including edge-case methods",
      "Do keep each decorator focused on a single concern (logging, caching, auth) to maintain composability",
      "Do order decorators intentionally — the sequence matters when behaviors interact (e.g., cache before auth vs after)"
    ],
    "dos_zh": [
      "通过完整实现组件接口（包括边缘情况方法）确保装饰器真正透明",
      "保持每个装饰器专注于单一关注点（日志、缓存、认证）以维持可组合性",
      "有意地排列装饰器顺序——当行为相互交互时顺序很重要（如缓存在认证之前还是之后）"
    ],
    "donts": [
      "Don't use Decorator when the class hierarchy is stable and one or two subclasses would suffice",
      "Don't create deeply nested decorator stacks without documenting the intended order, as debugging them is difficult",
      "Don't break the component interface in a decorator by adding public methods that clients are expected to call directly"
    ],
    "donts_zh": [
      "当类层次结构稳定且一两个子类已足够时，不要使用装饰器",
      "不要在不记录预期顺序的情况下创建深层嵌套的装饰器栈，因为调试它们很困难",
      "不要在装饰器中通过添加客户端预期直接调用的公共方法来破坏组件接口"
    ],
    "case_study_company": "Java I/O",
    "case_study": "Java's java.io package is the most cited real-world Decorator implementation. Reading a gzip-compressed file from disk uses a chain: new GZIPInputStream(new BufferedInputStream(new FileInputStream(file))). Each wrapper adds one capability — buffering, decompression — to the underlying InputStream. The consuming code reads from InputStream regardless of how many decorators are stacked, demonstrating transparent composability that would have required dozens of subclasses without the pattern.",
    "case_study_zh": "Java 的 java.io 包是最常被引用的真实装饰器实现。从磁盘读取 gzip 压缩文件使用了一条链：new GZIPInputStream(new BufferedInputStream(new FileInputStream(file)))。每个包装器向底层 InputStream 添加一种能力——缓冲、解压。消费代码无论堆叠了多少装饰器都从 InputStream 读取，展示了没有此模式就需要数十个子类才能实现的透明可组合性。",
    "when_not_to_use": [
      "When the component interface is large and implementing it fully in every decorator is burdensome",
      "When the order of decoration is complex and error-prone, and a Strategy or Chain of Responsibility is clearer",
      "Simple classes with one or two known extension points where straightforward subclassing is more readable"
    ],
    "when_not_to_use_zh": [
      "当组件接口很大，在每个装饰器中完整实现它很繁琐时",
      "当装饰顺序复杂且易出错，策略模式或职责链更清晰时",
      "具有一两个已知扩展点的简单类，直接子类化更易读"
    ],
    "adopters": [
      "Java I/O Streams",
      "Python (built-in @decorator)",
      "ASP.NET Core Middleware",
      "Express.js"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Martin, R. C. (2002). \"Agile Software Development, Principles, Patterns, and Practices\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "adapter-pattern",
        "type": "related"
      },
      {
        "slug": "strategy-pattern",
        "type": "related"
      }
    ]
  },
  {
    "id": 230,
    "name": "Adapter Pattern",
    "name_zh": "适配器模式",
    "slug": "adapter-pattern",
    "category": "coding",
    "desc": "Convert one interface to another that clients expect",
    "desc_zh": "将一个类的接口转换为客户期望的另一个接口，使原本不兼容的类可以协作",
    "steps": [
      "Identify the Target interface: define the interface that the client code is written against and expects to interact with",
      "Identify the Adaptee: locate the existing class with a useful but incompatible interface that you cannot or do not want to modify",
      "Create the Adapter class: implement the Target interface and hold a reference to the Adaptee (object adapter) or inherit from it (class adapter)",
      "Delegate in each Target method: implement each Target method by translating the call and its parameters into the equivalent Adaptee call",
      "Use the Adapter transparently: configure or inject the Adapter where the client expects a Target; the client remains unaware of the Adaptee"
    ],
    "steps_zh": [
      "确定目标接口：定义客户端代码所针对并期望交互的接口",
      "确定被适配者：找到拥有有用但不兼容接口的现有类，您无法或不想修改它",
      "创建适配器类：实现目标接口，持有被适配者的引用（对象适配器）或继承它（类适配器）",
      "在每个目标方法中委托：通过将调用及其参数转换为等效的被适配者调用来实现每个目标方法",
      "透明使用适配器：在客户端期望目标的地方配置或注入适配器；客户端对被适配者一无所知"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Client",
      "Target Interface",
      "Adapter",
      "Adaptee"
    ],
    "viz_labels_zh": [
      "客户端",
      "目标接口",
      "适配器",
      "被适配者"
    ],
    "related": [
      "decorator-pattern",
      "solid-principles"
    ],
    "tags": [
      "gof",
      "structural",
      "design-patterns",
      "interface",
      "integration",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "intermediate",
    "when_to_use": [
      "Integrating a third-party library or legacy component whose interface is incompatible with your system's expectations",
      "You want to reuse existing classes without modifying their source code",
      "Building an anti-corruption layer to translate between external and internal domain models"
    ],
    "when_to_use_zh": [
      "集成第三方库或遗留组件，其接口与系统期望不兼容",
      "希望复用现有类而不修改其源代码",
      "构建防腐层以在外部和内部领域模型之间进行转换"
    ],
    "core_concepts": [
      "Target: the interface clients expect; adapters implement this to become drop-in replacements",
      "Adaptee: the existing class with incompatible interface being wrapped by the adapter",
      "Object vs Class adapter: object adapters use composition (preferred); class adapters use multiple inheritance (language-dependent)"
    ],
    "core_concepts_zh": [
      "目标：客户端期望的接口；适配器实现此接口以成为可替换的插入式组件",
      "被适配者：被适配器包装的、具有不兼容接口的现有类",
      "对象适配器 vs 类适配器：对象适配器使用组合（推荐）；类适配器使用多重继承（取决于语言）"
    ],
    "timeline": [
      [
        "1994",
        "GoF formalize Adapter as a structural pattern for interface translation and legacy integration"
      ],
      [
        "2002",
        "Adapter becomes the standard approach for wrapping SOAP services behind clean Java interfaces in J2EE"
      ],
      [
        "2014",
        "Anti-Corruption Layer in DDD is widely recognized as a strategic application of the Adapter pattern"
      ],
      [
        "2023",
        "LLM tool/function adapters translate between agent tool schemas and underlying API signatures"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 将适配器正式化为用于接口转换和遗留集成的结构型模式"
      ],
      [
        "2002",
        "适配器成为在 J2EE 中将 SOAP 服务包装在整洁 Java 接口后面的标准方法"
      ],
      [
        "2014",
        "DDD 中的防腐层被广泛认可为适配器模式的战略应用"
      ],
      [
        "2023",
        "LLM 工具/函数适配器在智能体工具模式和底层 API 签名之间进行转换"
      ]
    ],
    "dos": [
      "Do prefer object adapters (composition) over class adapters (inheritance) for better flexibility and testability",
      "Do keep the adapter thin — its only job is interface translation, not additional business logic",
      "Do create an adapter per external system boundary to isolate integration concerns and simplify testing with mock adaptees"
    ],
    "dos_zh": [
      "优先使用对象适配器（组合）而非类适配器（继承），以获得更好的灵活性和可测试性",
      "保持适配器精简——其唯一职责是接口转换，而非额外的业务逻辑",
      "为每个外部系统边界创建一个适配器，以隔离集成关注点并简化使用模拟被适配者的测试"
    ],
    "donts": [
      "Don't put business logic in the adapter; it should only translate, not transform semantics",
      "Don't create adapters for interfaces that could simply be updated; only adapt what you cannot change",
      "Don't chain multiple adapters together — it signals a deeper design problem that should be addressed structurally"
    ],
    "donts_zh": [
      "不要在适配器中放置业务逻辑；它只应翻译，而非转换语义",
      "不要为可以直接更新的接口创建适配器；只适配您无法更改的内容",
      "不要将多个适配器链接在一起——这表明存在应从结构上解决的更深层设计问题"
    ],
    "case_study_company": "AWS SDK",
    "case_study": "The AWS SDK for Java uses Adapter extensively in its service client wrappers. The SDK exposes clean, idiomatic Java interfaces (S3Client, DynamoDbClient) that serve as Targets, while internally adapting the raw HTTP request/response cycle of each AWS service API. Application teams depend only on the idiomatic Target interface; when AWS changes the underlying HTTP protocol or authentication mechanism, only the adapter layer changes, insulating all business logic from API churn.",
    "case_study_zh": "Java 版 AWS SDK 在其服务客户端包装器中广泛使用适配器。SDK 暴露整洁的、符合 Java 惯用法的接口（S3Client、DynamoDbClient）作为目标，同时在内部适配每个 AWS 服务 API 的原始 HTTP 请求/响应周期。应用团队只依赖惯用目标接口；当 AWS 更改底层 HTTP 协议或认证机制时，只有适配器层发生变化，使所有业务逻辑免受 API 变动的影响。",
    "when_not_to_use": [
      "When you control both interfaces and can simply unify them without an adapter",
      "When the semantic gap between Adaptee and Target is so large that translation would be complex and error-prone",
      "When extensive data transformation is required — consider a dedicated mapper or anti-corruption layer with richer logic instead"
    ],
    "when_not_to_use_zh": [
      "当您控制两个接口且可以直接统一它们而无需适配器时",
      "当被适配者和目标之间的语义差距如此之大，以至于转换会变得复杂且容易出错时",
      "当需要大量数据转换时——考虑使用具有更丰富逻辑的专用映射器或防腐层"
    ],
    "adopters": [
      "AWS SDK",
      "Spring Data",
      "SLF4J",
      "Jakarta EE Connectors"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "portability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Evans, E. (2003). \"Domain-Driven Design\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "decorator-pattern",
        "type": "related"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "hexagonal-architecture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 231,
    "name": "Singleton Pattern",
    "name_zh": "单例模式",
    "slug": "singleton-pattern",
    "category": "coding",
    "desc": "Ensure a class has only one instance with a global access point",
    "desc_zh": "确保一个类只有一个实例，并提供一个访问它的全局入口点",
    "steps": [
      "Make the constructor private: prevent external code from instantiating the class directly using the new keyword",
      "Declare a private static instance field: hold the single instance inside the class itself",
      "Provide a public static accessor method: implement getInstance() that creates the instance on first call and returns it on every subsequent call",
      "Handle thread safety: in multi-threaded environments, use double-checked locking, an initialization-on-demand holder, or language-level guarantees to prevent race conditions",
      "Consider dependency injection as an alternative: in modern applications prefer injecting a single shared instance via a DI container rather than relying on a static accessor"
    ],
    "steps_zh": [
      "将构造函数设为私有：防止外部代码直接使用 new 关键字实例化类",
      "声明私有静态实例字段：在类内部持有单一实例",
      "提供公共静态访问方法：实现 getInstance()，在第一次调用时创建实例，后续每次调用返回该实例",
      "处理线程安全：在多线程环境中，使用双重检查锁定、按需初始化持有者或语言级保证来防止竞态条件",
      "考虑依赖注入作为替代方案：在现代应用中，优先通过 DI 容器注入单一共享实例，而非依赖静态访问器"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Instance Check",
      "Single Instance",
      "Global Access",
      "Client"
    ],
    "viz_labels_zh": [
      "实例检查",
      "唯一实例",
      "全局访问",
      "客户端"
    ],
    "related": [
      "solid-principles",
      "dependency-injection",
      "factory-method-pattern"
    ],
    "tags": [
      "gof",
      "creational",
      "design-patterns",
      "instance-management",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "beginner",
    "when_to_use": [
      "Exactly one shared resource must exist — e.g., a thread pool, configuration registry, or hardware interface driver",
      "Global coordination is required and multiple instances would cause correctness issues (e.g., duplicate caches)",
      "A legacy codebase cannot use dependency injection and needs a controlled global access point"
    ],
    "when_to_use_zh": [
      "必须存在唯一一个共享资源——如线程池、配置注册表或硬件接口驱动",
      "需要全局协调，且存在多个实例会导致正确性问题（如重复缓存）",
      "遗留代码库无法使用依赖注入，需要一个受控的全局访问点"
    ],
    "core_concepts": [
      "Single instance guarantee: only one object of the class can exist in the process; enforced by private constructor",
      "Global access point: the static getInstance() method provides a well-known entry to the single instance from anywhere in the codebase",
      "Lazy vs eager initialization: the instance can be created on first access (lazy) or at class load time (eager), each with different thread-safety implications"
    ],
    "core_concepts_zh": [
      "单实例保证：进程中只能存在该类的一个对象；通过私有构造函数强制实现",
      "全局访问点：静态 getInstance() 方法提供从代码库任何地方访问单一实例的知名入口",
      "懒加载 vs 饿加载：实例可在首次访问时创建（懒加载）或在类加载时创建（饿加载），各有不同的线程安全含义"
    ],
    "timeline": [
      [
        "1994",
        "GoF introduce Singleton as a creational pattern; it quickly becomes one of the most recognized patterns"
      ],
      [
        "2004",
        "Singleton earns criticism as an anti-pattern due to hidden global state, testing difficulties, and tight coupling"
      ],
      [
        "2009",
        "Spring and Guice DI containers popularize singleton-scoped beans as a managed, testable alternative"
      ],
      [
        "2020",
        "Singleton remains useful for low-level infrastructure (loggers, metrics registries) but avoided in domain logic"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 将单例作为创建型模式引入；它迅速成为最广为人知的模式之一"
      ],
      [
        "2004",
        "由于隐藏全局状态、测试困难和紧耦合，单例因被视为反模式而受到批评"
      ],
      [
        "2009",
        "Spring 和 Guice DI 容器推广单例作用域 Bean，作为可管理、可测试的替代方案"
      ],
      [
        "2020",
        "单例对于低级基础设施（日志记录器、指标注册表）仍有用，但在领域逻辑中应避免使用"
      ]
    ],
    "dos": [
      "Do prefer DI-container-managed singletons over static getInstance() for testability and lifecycle control",
      "Do use thread-safe initialization (enum singleton in Java, module-level object in Python) to avoid subtle concurrency bugs",
      "Do limit Singleton to stateless or append-only resources; mutable shared state is a concurrency hazard"
    ],
    "dos_zh": [
      "优先使用 DI 容器管理的单例而非静态 getInstance()，以获得可测试性和生命周期控制",
      "使用线程安全的初始化方式（Java 中的枚举单例、Python 中的模块级对象）以避免微妙的并发缺陷",
      "将单例限制于无状态或只追加的资源；可变共享状态是并发风险"
    ],
    "donts": [
      "Don't use Singleton for domain objects — it makes them impossible to vary, test in isolation, or run concurrently",
      "Don't store mutable business state in a Singleton because it introduces race conditions and makes tests order-dependent",
      "Don't use Singleton as a convenient global variable bag — it is an architectural coupling that erodes maintainability over time"
    ],
    "donts_zh": [
      "不要将单例用于领域对象——这使它们无法变化、无法隔离测试或并发运行",
      "不要在单例中存储可变业务状态，因为这会引入竞态条件并使测试依赖于顺序",
      "不要将单例用作方便的全局变量袋——它是一种随时间侵蚀可维护性的架构耦合"
    ],
    "case_study_company": "Log4j / SLF4J",
    "case_study": "The logging ecosystem (Log4j, Logback, SLF4J) uses a managed Singleton approach for the LoggerFactory. The root logger and its appenders are singletons scoped to the JVM process, ensuring all threads write to the same configured output destinations without creating duplicate file handles or losing log entries. Modern logging frameworks implement this via class-loading guarantees (not double-checked locking), demonstrating the correct, thread-safe form of the pattern.",
    "case_study_zh": "日志生态系统（Log4j、Logback、SLF4J）为 LoggerFactory 使用托管单例方式。根日志记录器及其附加器是作用于 JVM 进程的单例，确保所有线程写入相同的已配置输出目的地，而不会创建重复的文件句柄或丢失日志条目。现代日志框架通过类加载保证（而非双重检查锁定）实现这一点，展示了模式的正确、线程安全形式。",
    "when_not_to_use": [
      "Domain model objects that represent real-world entities with potentially multiple instances",
      "Services that need to be substituted with test doubles in unit testing",
      "Any case where multiple independent instances improve correctness, throughput, or testability"
    ],
    "when_not_to_use_zh": [
      "代表现实世界实体、可能有多个实例的领域模型对象",
      "需要在单元测试中用测试替身替代的服务",
      "多个独立实例能提高正确性、吞吐量或可测试性的任何情况"
    ],
    "adopters": [
      "Log4j / SLF4J",
      "Spring (singleton scope)",
      "Node.js module system",
      "Android Application class"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Bloch, J. (2008). \"Effective Java, 2nd ed.\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "related"
      },
      {
        "slug": "dependency-injection",
        "type": "alternative"
      },
      {
        "slug": "factory-method-pattern",
        "type": "related"
      }
    ]
  },
  {
    "id": 232,
    "name": "Command Pattern",
    "name_zh": "命令模式",
    "slug": "command-pattern",
    "category": "coding",
    "desc": "Encapsulate a request as an object for undo, queue, or logging",
    "desc_zh": "将请求封装为对象，从而支持撤销、排队或日志记录等操作",
    "steps": [
      "Define the Command interface: declare an execute() method (and optionally undo()) that all command objects must implement",
      "Implement Concrete Commands: create one class per operation, storing the Receiver reference and any parameters needed to carry out the action",
      "Build the Invoker: hold a Command reference (or queue), call execute() at the appropriate time, and optionally maintain a history stack for undo",
      "Configure the Receiver: the Receiver contains the actual business logic; Concrete Commands delegate to it rather than implementing the operation themselves",
      "Compose macro commands: combine multiple Commands into a CompositeCommand to execute complex workflows atomically or with unified undo support"
    ],
    "steps_zh": [
      "定义命令接口：声明所有命令对象必须实现的 execute() 方法（以及可选的 undo()）",
      "实现具体命令：为每个操作创建一个类，存储接收者引用和执行操作所需的任何参数",
      "构建调用者：持有命令引用（或队列），在适当时机调用 execute()，并可选地维护用于撤销的历史栈",
      "配置接收者：接收者包含实际的业务逻辑；具体命令委托给它而非自己实现操作",
      "组合宏命令：将多个命令组合成复合命令，以原子方式或统一撤销支持执行复杂工作流"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Client",
      "Invoker",
      "Command",
      "Receiver"
    ],
    "viz_labels_zh": [
      "客户端",
      "调用者",
      "命令",
      "接收者"
    ],
    "related": [
      "strategy-pattern",
      "observer-pattern",
      "solid-principles"
    ],
    "tags": [
      "gof",
      "behavioral",
      "design-patterns",
      "undo",
      "queue",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "intermediate",
    "when_to_use": [
      "You need undoable operations — encapsulating each action with an undo() method enables a full undo/redo history",
      "Operations must be queued, scheduled, or transmitted across a network as serializable objects",
      "An AI agent needs to plan, validate, and selectively execute or roll back a sequence of actions"
    ],
    "when_to_use_zh": [
      "需要可撤销操作——将每个动作与 undo() 方法封装，实现完整的撤销/重做历史",
      "操作必须作为可序列化对象进行排队、调度或通过网络传输",
      "AI 智能体需要规划、验证并选择性地执行或回滚一系列动作"
    ],
    "core_concepts": [
      "Command object: encapsulates an operation as a first-class object, decoupling the sender from the receiver and the timing of execution",
      "Invoker: triggers command execution and can maintain a command history for undo/redo, retry, or audit logging",
      "Receiver: the object that knows how to perform the actual work; commands delegate to it, keeping themselves thin"
    ],
    "core_concepts_zh": [
      "命令对象：将操作封装为一等对象，将发送者与接收者及执行时机解耦",
      "调用者：触发命令执行，可维护命令历史以支持撤销/重做、重试或审计日志",
      "接收者：知道如何执行实际工作的对象；命令委托给它，使自身保持精简"
    ],
    "timeline": [
      [
        "1994",
        "GoF formalize Command as a behavioral pattern, unifying transactions, macros, and UI actions under one concept"
      ],
      [
        "2003",
        "Command becomes the backbone of GUI undo/redo in editors like Eclipse and IntelliJ IDEA"
      ],
      [
        "2014",
        "Event sourcing architectures adopt Command as the write model, storing commands as an immutable audit log"
      ],
      [
        "2023",
        "LLM agent frameworks (LangChain, AutoGen) model tool calls as Command objects enabling replay and rollback"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 将命令正式化为行为型模式，将事务、宏和 UI 动作统一在一个概念下"
      ],
      [
        "2003",
        "命令成为 Eclipse 和 IntelliJ IDEA 等编辑器中 GUI 撤销/重做的核心"
      ],
      [
        "2014",
        "事件溯源架构将命令作为写模型，将命令存储为不可变的审计日志"
      ],
      [
        "2023",
        "LLM 智能体框架（LangChain、AutoGen）将工具调用建模为命令对象，支持重放和回滚"
      ]
    ],
    "dos": [
      "Do implement undo() alongside execute() from the start if reversibility is a requirement — retrofitting it later is painful",
      "Do keep Command objects immutable and serializable so they can be logged, transmitted, and replayed reliably",
      "Do use Command queues for rate limiting, retry logic, and background processing to decouple production from consumption"
    ],
    "dos_zh": [
      "如果需要可逆性，从一开始就与 execute() 一起实现 undo()——事后补加很痛苦",
      "保持命令对象不可变且可序列化，以便可靠地记录、传输和重放",
      "使用命令队列进行速率限制、重试逻辑和后台处理，将生产与消费解耦"
    ],
    "donts": [
      "Don't put business logic in the Command itself — delegate to the Receiver and keep the Command as a thin action descriptor",
      "Don't create a Command class for every single trivial method call; overhead is only justified when queuing, undo, or logging is needed",
      "Don't neglect failure handling in undo() — a partial undo that leaves the system in an inconsistent state is worse than no undo"
    ],
    "donts_zh": [
      "不要将业务逻辑放在命令本身——委托给接收者，保持命令作为精简的动作描述符",
      "不要为每个简单的方法调用创建命令类；只有在需要排队、撤销或日志记录时开销才是合理的",
      "不要忽视 undo() 中的失败处理——将系统置于不一致状态的部分撤销比没有撤销更糟糕"
    ],
    "case_study_company": "Microsoft Word",
    "case_study": "Microsoft Word's undo/redo system is a canonical Command pattern implementation. Every user action — typing a character, changing font size, inserting an image — is encapsulated as a Command object stored in a history stack. Ctrl+Z pops and calls undo(); Ctrl+Y pushes and calls execute() again. This architecture allows Word to maintain a 100-step undo history across arbitrarily complex nested formatting operations, and the same Command objects are reused for macro recording and the Track Changes feature.",
    "case_study_zh": "Microsoft Word 的撤销/重做系统是命令模式的典型实现。每个用户操作——输入字符、更改字体大小、插入图片——都被封装为存储在历史栈中的命令对象。Ctrl+Z 弹出并调用 undo()；Ctrl+Y 压入并再次调用 execute()。这种架构使 Word 能够在任意复杂的嵌套格式化操作中维护 100 步的撤销历史，同样的命令对象被复用于宏录制和修订追踪功能。",
    "when_not_to_use": [
      "Simple request-response flows with no need for undo, queuing, or logging overhead",
      "High-frequency operations (millions per second) where object allocation per-command is a measurable bottleneck",
      "Purely functional codebases where immutable data transformations are already first-class and eliminate the need for Command objects"
    ],
    "when_not_to_use_zh": [
      "无需撤销、排队或日志开销的简单请求-响应流",
      "高频操作（每秒数百万次），每个命令的对象分配是可测量的瓶颈",
      "纯函数式代码库，不可变数据转换已是一等公民，无需命令对象"
    ],
    "adopters": [
      "Microsoft Word / Office",
      "Eclipse IDE",
      "Git (commits as commands)",
      "Apache Kafka (log as command store)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Martin, R. C. (2002). \"Agile Software Development, Principles, Patterns, and Practices\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "strategy-pattern",
        "type": "related"
      },
      {
        "slug": "observer-pattern",
        "type": "related"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      }
    ]
  },
  {
    "id": 233,
    "name": "Template Method Pattern",
    "name_zh": "模板方法模式",
    "slug": "template-method-pattern",
    "category": "coding",
    "desc": "Define algorithm skeleton in base class, let subclasses override specific steps",
    "desc_zh": "在基类中定义算法骨架，允许子类覆盖特定步骤而不改变算法整体结构",
    "steps": [
      "Define the abstract base class: declare the template method as a final (non-overridable) method that calls a sequence of primitive operations in the correct order",
      "Identify primitive operations: split the algorithm into steps — mark required steps as abstract methods and optional steps as hook methods with default implementations",
      "Implement concrete subclasses: override only the primitive operations that differ, leaving the overall algorithm structure untouched in the base class",
      "Apply hook methods for optional behavior: provide empty or no-op hook implementations in the base class so subclasses can optionally extend behavior at defined extension points",
      "Favor composition if variability grows: when the number of overridable steps proliferates, consider replacing inheritance with Strategy objects injected into a single template class"
    ],
    "steps_zh": [
      "定义抽象基类：将模板方法声明为 final（不可覆盖）方法，按正确顺序调用一系列基本操作",
      "识别基本操作：将算法拆分为步骤——将必须实现的步骤标记为抽象方法，可选步骤标记为带默认实现的钩子方法",
      "实现具体子类：只覆盖不同的基本操作，保持基类中的整体算法结构不变",
      "使用钩子方法实现可选行为：在基类中提供空的或无操作的钩子实现，使子类可以在定义的扩展点选择性地扩展行为",
      "当可变性增长时倾向组合：当可覆盖步骤数量激增时，考虑用注入单一模板类的策略对象替代继承"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Template",
      "Abstract Step",
      "Concrete Step",
      "Hook"
    ],
    "viz_labels_zh": [
      "模板方法",
      "抽象步骤",
      "具体步骤",
      "钩子"
    ],
    "related": [
      "strategy-pattern",
      "solid-principles",
      "factory-method-pattern"
    ],
    "tags": [
      "gof",
      "behavioral",
      "design-patterns",
      "inheritance",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "beginner",
    "when_to_use": [
      "Multiple subclasses share the same algorithm structure but differ in one or more concrete steps",
      "You want to enforce a fixed sequence of operations while allowing flexibility at specific extension points",
      "Eliminating code duplication across related classes by pulling the invariant parts into a common base"
    ],
    "when_to_use_zh": [
      "多个子类共享相同的算法结构，但在一个或多个具体步骤上有所不同",
      "需要强制执行固定的操作顺序，同时允许在特定扩展点灵活处理",
      "通过将不变部分提取到公共基类中，消除相关类之间的代码重复"
    ],
    "core_concepts": [
      "Template method: the invariant algorithm skeleton defined in the base class as a final method that orchestrates calls to primitive operations",
      "Primitive operations: abstract or hook methods that represent the variable parts of the algorithm; subclasses override these to specialize behavior",
      "Hollywood Principle: the base class calls subclass methods, not the other way around — high-level components control flow and low-level components fill in details"
    ],
    "core_concepts_zh": [
      "模板方法：在基类中定义为 final 方法的不变算法骨架，负责编排对基本操作的调用",
      "基本操作：代表算法可变部分的抽象方法或钩子方法；子类通过覆盖这些方法来专门化行为",
      "好莱坞原则：基类调用子类方法，而非反过来——高层组件控制流程，低层组件填充细节"
    ],
    "timeline": [
      [
        "1994",
        "GoF formalize Template Method as a behavioral pattern, observing it was already widely used in object-oriented frameworks"
      ],
      [
        "2000",
        "Java AWT and Swing adopt Template Method extensively in component lifecycle callbacks (paint, update, validate)"
      ],
      [
        "2005",
        "JUnit test framework uses Template Method in TestCase, with setUp() and tearDown() as hook methods around each test"
      ],
      [
        "2015",
        "Modern frameworks favor composition and lambda callbacks over inheritance-based template methods to reduce coupling"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 将模板方法正式化为行为型模式，注意到它已在面向对象框架中被广泛使用"
      ],
      [
        "2000",
        "Java AWT 和 Swing 在组件生命周期回调（paint、update、validate）中大量采用模板方法"
      ],
      [
        "2005",
        "JUnit 测试框架在 TestCase 中使用模板方法，setUp() 和 tearDown() 作为每个测试前后的钩子方法"
      ],
      [
        "2015",
        "现代框架倾向于使用组合和 lambda 回调而非基于继承的模板方法，以降低耦合"
      ]
    ],
    "dos": [
      "Do declare the template method final to prevent subclasses from accidentally breaking the algorithm invariant",
      "Do minimize the number of abstract steps to reduce the implementation burden on concrete subclasses",
      "Do document the intended contract of each primitive operation so subclass authors understand what invariants they must preserve"
    ],
    "dos_zh": [
      "将模板方法声明为 final，防止子类意外破坏算法不变量",
      "最小化抽象步骤的数量，减少具体子类的实现负担",
      "记录每个基本操作的预期契约，使子类作者了解必须保持哪些不变量"
    ],
    "donts": [
      "Don't override the template method in subclasses — doing so defeats the entire purpose of the pattern and leads to divergent algorithm variants",
      "Don't create deep inheritance hierarchies using Template Method because they become brittle and hard to understand as the hierarchy grows",
      "Don't use Template Method when the varying behavior is unrelated to a shared algorithm structure — Strategy composition is more appropriate"
    ],
    "donts_zh": [
      "不要在子类中覆盖模板方法——这样做违背了模式的整体目的，导致算法变体分歧",
      "不要使用模板方法创建深度继承层次结构，因为随着层次结构的增长，它们会变得脆弱且难以理解",
      "当可变行为与共享算法结构无关时，不要使用模板方法——策略组合更为合适"
    ],
    "case_study_company": "JUnit",
    "case_study": "JUnit's TestCase class is a textbook Template Method implementation. The runBare() method is the template — it calls setUp(), then runTest(), then tearDown() in a guaranteed sequence regardless of what happens inside the test. Test authors override only setUp() and tearDown() with their specific fixture logic, while JUnit controls the lifecycle. This allowed thousands of test classes to implement consistent setup/teardown behavior without duplicating the error-handling and lifecycle management code that JUnit provides centrally.",
    "case_study_zh": "JUnit 的 TestCase 类是模板方法的教科书式实现。runBare() 方法是模板——它以固定顺序调用 setUp()、runTest()、tearDown()，无论测试内部发生什么。测试作者只需用特定的固件逻辑覆盖 setUp() 和 tearDown()，而 JUnit 控制生命周期。这使数千个测试类能够实现一致的 setup/teardown 行为，而无需重复 JUnit 集中提供的错误处理和生命周期管理代码。",
    "when_not_to_use": [
      "When behavior varies along multiple independent dimensions simultaneously — use Strategy objects composed together instead",
      "When the algorithm steps have no natural fixed ordering and callers need full control over sequencing",
      "When the codebase favors functional programming where higher-order functions achieve the same result without inheritance"
    ],
    "when_not_to_use_zh": [
      "当行为同时沿多个独立维度变化时——改用组合在一起的策略对象",
      "当算法步骤没有自然的固定顺序，且调用方需要完全控制排序时",
      "当代码库倾向函数式编程，高阶函数可以在不使用继承的情况下达到相同效果时"
    ],
    "adopters": [
      "JUnit (TestCase lifecycle)",
      "Java AWT / Swing",
      "Spring (JdbcTemplate)",
      "Android Activity lifecycle"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Kerievsky, J. (2004). \"Refactoring to Patterns\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "strategy-pattern",
        "type": "alternative"
      },
      {
        "slug": "factory-method-pattern",
        "type": "related"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      }
    ]
  },
  {
    "id": 234,
    "name": "State Pattern",
    "name_zh": "状态模式",
    "slug": "state-pattern",
    "category": "coding",
    "desc": "Allow object behavior to change automatically when its internal state changes",
    "desc_zh": "允许对象在内部状态改变时自动改变其行为，使其看起来像改变了类",
    "steps": [
      "Identify the states: enumerate all distinct states the context object can be in and the transitions between them",
      "Define the State interface: declare all context-behavior methods that vary by state; every concrete state must implement this interface",
      "Implement Concrete State classes: one class per state, each implementing the behavior appropriate to that state and triggering transitions by calling context.setState() when conditions are met",
      "Configure the Context class: hold a reference to the current State object, delegate behavior calls to it, and expose setState() so state objects can drive transitions",
      "Initialize and transition: set the initial state in the Context constructor and let the state machine run by having concrete states or the context trigger transitions in response to events"
    ],
    "steps_zh": [
      "识别状态：枚举上下文对象可能处于的所有不同状态及其转换关系",
      "定义状态接口：声明所有随状态变化的上下文行为方法；每个具体状态必须实现此接口",
      "实现具体状态类：每个状态一个类，各自实现该状态下的适当行为，并在满足条件时通过调用 context.setState() 触发转换",
      "配置上下文类：持有对当前状态对象的引用，将行为调用委托给它，并暴露 setState() 使状态对象能驱动转换",
      "初始化和转换：在上下文构造函数中设置初始状态，让状态机运行，由具体状态或上下文响应事件触发转换"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Context",
      "State",
      "Transition",
      "Behavior"
    ],
    "viz_labels_zh": [
      "上下文",
      "状态",
      "状态转换",
      "行为"
    ],
    "related": [
      "strategy-pattern",
      "command-pattern",
      "observer-pattern"
    ],
    "tags": [
      "gof",
      "behavioral",
      "design-patterns",
      "state-machine",
      "oop"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》",
    "complexity": "intermediate",
    "when_to_use": [
      "An object's behavior depends heavily on its current state and must change at runtime as state transitions occur",
      "Large switch/if-else blocks handling state-dependent logic are becoming hard to maintain and extend",
      "Modeling workflows, protocol parsers, UI wizards, or game characters with distinct operational modes"
    ],
    "when_to_use_zh": [
      "对象的行为严重依赖于当前状态，并且必须随运行时状态转换而改变",
      "处理状态相关逻辑的大型 switch/if-else 块变得难以维护和扩展",
      "对工作流、协议解析器、UI 向导或具有不同操作模式的游戏角色进行建模"
    ],
    "core_concepts": [
      "Context: the object whose behavior varies; it delegates all state-dependent requests to the current State object and holds a reference that can be swapped at runtime",
      "State interface: defines the contract for all behaviors that differ across states, ensuring the Context can treat all states uniformly",
      "Concrete States: encapsulate the behavior and transition logic for one specific state, replacing large conditionals with focused, single-responsibility classes"
    ],
    "core_concepts_zh": [
      "上下文：行为发生变化的对象；它将所有状态相关请求委托给当前状态对象，并持有可在运行时交换的引用",
      "状态接口：定义跨状态不同的所有行为契约，确保上下文可以统一处理所有状态",
      "具体状态：封装一个特定状态的行为和转换逻辑，用专注、单一职责的类替代大型条件语句"
    ],
    "timeline": [
      [
        "1994",
        "GoF introduce State as a behavioral pattern, noting its close relationship with finite-state machines from computer science"
      ],
      [
        "2000",
        "Game development community adopts State pattern extensively for character AI, game modes, and UI screen transitions"
      ],
      [
        "2010",
        "Statecharts and tools like XState bring hierarchical and parallel state machines to frontend and Node.js applications"
      ],
      [
        "2020",
        "XState v4 gains mainstream adoption in React ecosystems, reviving interest in explicit state machine modeling for UI"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 引入状态作为行为型模式，注意到其与计算机科学中有限状态机的密切关系"
      ],
      [
        "2000",
        "游戏开发社区广泛采用状态模式用于角色 AI、游戏模式和 UI 屏幕转换"
      ],
      [
        "2010",
        "状态图和 XState 等工具将层次化和并行状态机引入前端和 Node.js 应用"
      ],
      [
        "2020",
        "XState v4 在 React 生态系统中获得主流采用，重新激发了对 UI 显式状态机建模的兴趣"
      ]
    ],
    "dos": [
      "Do let State objects know about each other or use named state constants on the Context to keep transitions explicit and auditable",
      "Do keep each Concrete State class small and focused on one state's behavior to preserve single-responsibility",
      "Do use State pattern alongside event logging to create an audit trail of every state transition for debugging and monitoring"
    ],
    "dos_zh": [
      "允许状态对象相互了解，或在上下文上使用命名状态常量，使转换显式且可审计",
      "保持每个具体状态类小巧，专注于一个状态的行为，以保持单一职责",
      "将状态模式与事件日志记录结合使用，为每次状态转换创建审计追踪，便于调试和监控"
    ],
    "donts": [
      "Don't scatter transition logic across both State classes and the Context — choose one location and keep it consistent",
      "Don't create a new State object on every transition if State objects are stateless — share flyweight instances to avoid allocation overhead",
      "Don't use the State pattern for simple two-state toggles; a boolean field is clearer and avoids over-engineering"
    ],
    "donts_zh": [
      "不要将转换逻辑分散在状态类和上下文两者中——选择一个位置并保持一致",
      "如果状态对象是无状态的，不要在每次转换时创建新的状态对象——共享享元实例以避免分配开销",
      "不要对简单的双状态切换使用状态模式；布尔字段更清晰，也避免了过度设计"
    ],
    "case_study_company": "XState / Stately",
    "case_study": "XState, the JavaScript state machine library, applies the State pattern at scale in frontend applications. A traffic-light component modeled in XState has explicit states (red, yellow, green) with guarded transitions, entry/exit actions, and parallel states for pedestrian signals. Netflix uses XState to manage the complex state of their video player — buffering, playing, paused, error, ad-break — ensuring that user interactions like clicking play during buffering produce predictable, testable outcomes rather than ad hoc if/else chains.",
    "case_study_zh": "XState（JavaScript 状态机库）在前端应用中大规模应用状态模式。用 XState 建模的交通灯组件具有显式状态（红、黄、绿），带守卫的转换、进入/退出动作以及行人信号的并行状态。Netflix 使用 XState 管理其视频播放器的复杂状态——缓冲、播放、暂停、错误、广告时段——确保在缓冲期间点击播放等用户交互产生可预测、可测试的结果，而非临时的 if/else 链。",
    "when_not_to_use": [
      "Objects with only one or two states where a simple boolean or enum is sufficient and self-documenting",
      "When state transitions are rare and the complexity of multiple state classes outweighs the maintenance benefit",
      "Stateless services or pure functions that have no mutable internal state to manage"
    ],
    "when_not_to_use_zh": [
      "只有一两个状态的对象，简单的布尔值或枚举就足够且自文档化",
      "当状态转换很少，多个状态类的复杂性超过维护收益时",
      "无状态服务或纯函数，没有可变内部状态需要管理"
    ],
    "adopters": [
      "XState / Stately",
      "Unity (game AI state machines)",
      "Apache Commons SCXML",
      "Redux (reducer as state machine)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Samek, M. (2008). \"Practical UML Statecharts in C/C++, 2nd ed.\". Newnes."
    ],
    "typed_relations": [
      {
        "slug": "strategy-pattern",
        "type": "related"
      },
      {
        "slug": "command-pattern",
        "type": "related"
      },
      {
        "slug": "observer-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 235,
    "name": "Repository Pattern",
    "name_zh": "仓储模式",
    "slug": "repository-pattern",
    "category": "coding",
    "desc": "Mediate between domain model and data mapping layers using a collection-like interface",
    "desc_zh": "使用类集合接口在领域模型与数据映射层之间进行中介，解耦业务逻辑与数据访问",
    "steps": [
      "Define the repository interface in the domain layer: express data-access needs as domain-language methods (findById, findByEmail, save, remove) without any persistence technology details",
      "Implement a concrete repository: create an infrastructure-layer class that satisfies the interface using an ORM, query builder, or raw SQL, translating between domain objects and persistence records",
      "Use the repository in application services: inject the repository interface into use cases or application services; never let domain logic call persistence APIs directly",
      "Add a Unit of Work if needed: coordinate multiple repositories under a single transaction boundary so that all domain changes within a use case commit or roll back atomically",
      "Test with in-memory fakes: implement a lightweight in-memory version of the repository interface for unit tests, exercising domain logic without database dependencies"
    ],
    "steps_zh": [
      "在领域层定义仓储接口：用领域语言方法（findById、findByEmail、save、remove）表达数据访问需求，不涉及任何持久化技术细节",
      "实现具体仓储：创建基础设施层类，使用 ORM、查询构建器或原始 SQL 满足接口，在领域对象和持久化记录之间进行转换",
      "在应用服务中使用仓储：将仓储接口注入用例或应用服务；永远不让领域逻辑直接调用持久化 API",
      "如需要则添加工作单元：在单一事务边界下协调多个仓储，使一个用例内的所有领域更改原子性地提交或回滚",
      "使用内存仿冒对象进行测试：为仓储接口实现轻量级内存版本，在不依赖数据库的情况下测试领域逻辑"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Domain",
      "Repository Interface",
      "Concrete Repository",
      "Data Source"
    ],
    "viz_labels_zh": [
      "领域层",
      "仓储接口",
      "具体仓储",
      "数据源"
    ],
    "related": [
      "unit-of-work-pattern",
      "data-mapper-pattern",
      "dependency-injection",
      "active-record-pattern"
    ],
    "tags": [
      "fowler",
      "poeaa",
      "data-access",
      "domain-driven-design",
      "persistence"
    ],
    "origin_author": "Martin Fowler, 2002",
    "origin_source": "Patterns of Enterprise Application Architecture",
    "origin_source_zh": "《企业应用架构模式》",
    "complexity": "intermediate",
    "when_to_use": [
      "Domain logic must be unit-tested without a real database, requiring an abstraction over persistence",
      "The application may need to switch storage backends (SQL to NoSQL, or file to cloud) without changing domain code",
      "Centralizing query logic prevents the same complex query from being duplicated across multiple services or controllers"
    ],
    "when_to_use_zh": [
      "领域逻辑必须在没有真实数据库的情况下进行单元测试，需要对持久化进行抽象",
      "应用可能需要切换存储后端（SQL 到 NoSQL，或文件到云），而无需更改领域代码",
      "集中查询逻辑可防止相同的复杂查询在多个服务或控制器中重复"
    ],
    "core_concepts": [
      "Collection semantics: the repository presents domain objects as if they live in an in-memory collection; callers add, remove, and query without knowing the underlying store",
      "Domain language interface: method names reflect the ubiquitous language of the domain (findActiveCustomers, not SELECT * FROM customers WHERE active=1)",
      "Persistence ignorance: domain objects and application services are unaware of the database schema, ORM, or storage technology — only the concrete repository implementation knows"
    ],
    "core_concepts_zh": [
      "集合语义：仓储将领域对象呈现为存在于内存集合中；调用方可添加、删除和查询，而不知道底层存储",
      "领域语言接口：方法名反映领域的通用语言（findActiveCustomers，而非 SELECT * FROM customers WHERE active=1）",
      "持久化无知：领域对象和应用服务不知道数据库模式、ORM 或存储技术——只有具体仓储实现才知道"
    ],
    "timeline": [
      [
        "2002",
        "Martin Fowler formalizes Repository in PoEAA as a domain-layer pattern that abstracts the data mapping layer"
      ],
      [
        "2004",
        "Eric Evans' Domain-Driven Design elevates Repository to a core DDD tactical pattern, pairing it with Aggregate roots"
      ],
      [
        "2010",
        "Spring Data auto-generates Repository implementations from interface method names, drastically reducing boilerplate"
      ],
      [
        "2020",
        "Repository becomes standard in Clean Architecture and Hexagonal Architecture as the primary persistence port/adapter boundary"
      ]
    ],
    "timeline_zh": [
      [
        "2002",
        "Martin Fowler 在 PoEAA 中将仓储正式化为领域层模式，抽象数据映射层"
      ],
      [
        "2004",
        "Eric Evans 的《领域驱动设计》将仓储提升为核心 DDD 战术模式，与聚合根配对使用"
      ],
      [
        "2010",
        "Spring Data 从接口方法名自动生成仓储实现，大幅减少样板代码"
      ],
      [
        "2020",
        "仓储成为整洁架构和六边形架构中的标准，作为主要的持久化端口/适配器边界"
      ]
    ],
    "dos": [
      "Do define repository interfaces in the domain layer and implementations in the infrastructure layer to enforce the dependency rule",
      "Do define repositories only for Aggregate roots — querying across aggregate boundaries should go through a dedicated query service or read model",
      "Do provide in-memory repository fakes alongside production implementations so teams can write fast, database-free unit tests"
    ],
    "dos_zh": [
      "在领域层定义仓储接口，在基础设施层定义实现，以强制执行依赖规则",
      "仅对聚合根使用仓储——跨聚合边界的查询应通过专用查询服务或读模型进行",
      "提供内存仓储仿冒对象与生产实现并存，使团队能够编写快速、无数据库依赖的单元测试"
    ],
    "donts": [
      "Don't expose IQueryable or database-specific query builders through the repository interface — doing so leaks persistence details into the domain",
      "Don't create one mega-repository for the entire application; scope each repository to its aggregate root to keep responsibilities focused",
      "Don't bypass the repository by directly calling an ORM or SQL from application services — it defeats the abstraction and couples domain logic to infrastructure"
    ],
    "donts_zh": [
      "不要通过仓储接口暴露 IQueryable 或数据库特定的查询构建器——这会将持久化细节泄漏到领域中",
      "不要为整个应用创建一个巨型仓储；将每个仓储限定到其聚合根，保持职责专注",
      "不要通过直接从应用服务调用 ORM 或 SQL 来绕过仓储——这会破坏抽象并将领域逻辑与基础设施耦合"
    ],
    "case_study_company": "Spring Data / Pivotal",
    "case_study": "Spring Data's CrudRepository and JpaRepository interfaces are the most widely used Repository pattern implementation in the Java ecosystem. Developers declare an interface extending JpaRepository<Customer, Long>, annotate query methods with @Query or follow naming conventions like findByEmailAndActiveTrue(), and Spring Data generates the full implementation at startup. This eliminates thousands of lines of JDBC/JPA boilerplate, while the interface boundary allows teams to swap Spring Data JPA for Spring Data MongoDB or an in-memory fake without changing a single line of service code.",
    "case_study_zh": "Spring Data 的 CrudRepository 和 JpaRepository 接口是 Java 生态系统中使用最广泛的仓储模式实现。开发者声明一个扩展 JpaRepository<Customer, Long> 的接口，用 @Query 注解查询方法或遵循 findByEmailAndActiveTrue() 等命名约定，Spring Data 在启动时生成完整实现。这消除了数千行 JDBC/JPA 样板代码，而接口边界允许团队将 Spring Data JPA 换成 Spring Data MongoDB 或内存仿冒对象，而无需更改任何服务代码。",
    "when_not_to_use": [
      "Simple CRUD applications with no domain logic where an Active Record pattern or direct ORM calls are simpler and equally testable",
      "Read-heavy reporting scenarios where complex JOIN queries benefit from direct SQL or a dedicated CQRS read model rather than repository abstractions",
      "Microservices that own a single table with trivial access patterns where the repository indirection adds no value"
    ],
    "when_not_to_use_zh": [
      "没有领域逻辑的简单 CRUD 应用，活动记录模式或直接 ORM 调用更简单且同样可测试",
      "读密集型报告场景，复杂 JOIN 查询受益于直接 SQL 或专用 CQRS 读模型，而非仓储抽象",
      "拥有单个表且访问模式简单的微服务，仓储间接层没有带来任何价值"
    ],
    "adopters": [
      "Spring Data (Java)",
      "Entity Framework Core (.NET)",
      "Doctrine ORM (PHP)",
      "TypeORM / MikroORM (Node.js)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Fowler, M. (2002). \"Patterns of Enterprise Application Architecture\". Addison-Wesley.",
    "secondary_sources": [
      "Evans, E. (2003). \"Domain-Driven Design: Tackling Complexity in the Heart of Software\". Addison-Wesley.",
      "Vernon, V. (2013). \"Implementing Domain-Driven Design\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "unit-of-work-pattern",
        "type": "complement"
      },
      {
        "slug": "data-mapper-pattern",
        "type": "complement"
      },
      {
        "slug": "dependency-injection",
        "type": "complement"
      },
      {
        "slug": "active-record-pattern",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 236,
    "name": "Unit of Work Pattern",
    "name_zh": "工作单元模式",
    "slug": "unit-of-work-pattern",
    "category": "coding",
    "desc": "Track object changes during a business transaction and commit them as a single atomic batch",
    "desc_zh": "在业务事务期间跟踪对象变更，并将其作为单一原子批次提交到数据库",
    "steps": [
      "Open a Unit of Work: begin a new UoW instance at the start of each business operation, representing a single logical transaction boundary",
      "Register changes: as domain operations occur, register new, dirty (modified), and removed objects with the UoW rather than writing to the database immediately",
      "Coordinate repositories: repositories created within the same UoW share its identity map and change tracker to ensure consistency across aggregate boundaries",
      "Commit atomically: when the business operation completes successfully, call commit() to flush all changes in a single database transaction — insert new, update dirty, delete removed",
      "Roll back on failure: if any step throws, call rollback() to discard all tracked changes and leave the database in its pre-operation state"
    ],
    "steps_zh": [
      "打开工作单元：在每个业务操作开始时创建一个新的 UoW 实例，代表单一的逻辑事务边界",
      "注册变更：随着领域操作的发生，将新建的、脏的（已修改的）和已删除的对象注册到 UoW，而非立即写入数据库",
      "协调仓储：在同一 UoW 内创建的仓储共享其身份映射和变更跟踪器，确保跨聚合边界的一致性",
      "原子提交：当业务操作成功完成时，调用 commit() 在单一数据库事务中刷新所有变更——插入新建、更新脏的、删除已删除的",
      "失败时回滚：如果任何步骤抛出异常，调用 rollback() 丢弃所有跟踪的变更，使数据库保持操作前的状态"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Register New",
      "Register Dirty",
      "Register Deleted",
      "Commit"
    ],
    "viz_labels_zh": [
      "注册新增",
      "注册修改",
      "注册删除",
      "提交事务"
    ],
    "related": [
      "repository-pattern",
      "data-mapper-pattern",
      "solid-principles"
    ],
    "tags": [
      "fowler",
      "poeaa",
      "transactions",
      "data-access",
      "persistence"
    ],
    "origin_author": "Martin Fowler, 2002",
    "origin_source": "Patterns of Enterprise Application Architecture",
    "origin_source_zh": "《企业应用架构模式》",
    "complexity": "intermediate",
    "when_to_use": [
      "A single business operation modifies multiple domain objects or aggregates that must all succeed or fail together",
      "You need to batch database writes for performance, reducing round-trips from N individual inserts to one transaction",
      "Preventing partial updates that would leave the system in an inconsistent state if an error occurs mid-operation"
    ],
    "when_to_use_zh": [
      "单一业务操作修改多个领域对象或聚合，它们必须全部成功或全部失败",
      "需要批量数据库写入以提升性能，将 N 个单独插入减少到一个事务",
      "防止在操作中途发生错误时留下不一致状态的部分更新"
    ],
    "core_concepts": [
      "Change tracking: the UoW maintains three lists — new objects (to INSERT), dirty objects (to UPDATE), and removed objects (to DELETE) — populated as domain operations mutate state",
      "Identity map: a cache keyed by entity ID within the UoW scope ensures the same database row is loaded only once per transaction, preventing conflicting in-memory copies",
      "Atomic commit: all registered changes are flushed inside a single database transaction; if the transaction fails, the UoW discards the identity map and change lists"
    ],
    "core_concepts_zh": [
      "变更跟踪：UoW 维护三个列表——新对象（需 INSERT）、脏对象（需 UPDATE）和已删除对象（需 DELETE）——在领域操作改变状态时填充",
      "身份映射：在 UoW 范围内以实体 ID 为键的缓存确保每个事务只加载一次相同的数据库行，防止内存中的冲突副本",
      "原子提交：所有注册的变更在单一数据库事务中刷新；如果事务失败，UoW 丢弃身份映射和变更列表"
    ],
    "timeline": [
      [
        "2002",
        "Martin Fowler formalizes Unit of Work in PoEAA, describing it as a pattern already implemented by ORMs like TopLink"
      ],
      [
        "2006",
        "Hibernate's Session and .NET's DataContext implement UoW transparently, making the pattern invisible but ubiquitous"
      ],
      [
        "2012",
        "Entity Framework DbContext becomes the canonical .NET UoW implementation, combining repository and UoW in one object"
      ],
      [
        "2018",
        "CQRS architectures use explicit UoW for the write side while the read side bypasses it entirely for performance"
      ]
    ],
    "timeline_zh": [
      [
        "2002",
        "Martin Fowler 在 PoEAA 中将工作单元正式化，描述它为 TopLink 等 ORM 已实现的模式"
      ],
      [
        "2006",
        "Hibernate 的 Session 和 .NET 的 DataContext 透明地实现了 UoW，使该模式不可见但无处不在"
      ],
      [
        "2012",
        "Entity Framework 的 DbContext 成为规范的 .NET UoW 实现，将仓储和 UoW 合并在一个对象中"
      ],
      [
        "2018",
        "CQRS 架构在写端使用显式 UoW，读端为了性能完全绕过它"
      ]
    ],
    "dos": [
      "Do scope each Unit of Work to a single business request or use case — long-lived UoW sessions accumulate stale state and lock database rows unnecessarily",
      "Do let the application service control commit/rollback so that transaction boundaries align with business operation boundaries, not individual repository calls",
      "Do combine Unit of Work with the Repository pattern so repositories share the same UoW session and changes are flushed together"
    ],
    "dos_zh": [
      "将每个工作单元限定在单一业务请求或用例上——长期存活的 UoW 会话会积累过时状态并不必要地锁定数据库行",
      "让应用服务控制提交/回滚，使事务边界与业务操作边界对齐，而非单个仓储调用",
      "将工作单元与仓储模式结合使用，使仓储共享同一 UoW 会话，变更一起刷新"
    ],
    "donts": [
      "Don't use a single UoW for an entire user session or HTTP request that spans multiple unrelated business operations — this creates large, hard-to-debug transactions",
      "Don't call commit() multiple times within a single UoW; design the boundary so one business operation maps to one commit",
      "Don't expose the UoW directly to domain objects — domain logic should not know or care about transaction management"
    ],
    "donts_zh": [
      "不要对跨越多个不相关业务操作的整个用户会话或 HTTP 请求使用单一 UoW——这会创建大型、难以调试的事务",
      "不要在单一 UoW 中多次调用 commit()；设计边界使一个业务操作映射到一次提交",
      "不要将 UoW 直接暴露给领域对象——领域逻辑不应知道或关心事务管理"
    ],
    "case_study_company": "Entity Framework / Microsoft",
    "case_study": "Entity Framework Core's DbContext is the most widely used Unit of Work implementation in the .NET ecosystem. When an ASP.NET controller action updates a customer's address and creates an order in the same request, both operations are performed on in-memory entity objects tracked by a single DbContext. Only when SaveChanges() is called does EF Core analyze the change tracker, generate the minimal SQL (UPDATE + INSERT), and execute both statements inside a single transaction. If the INSERT fails, the entire transaction rolls back, leaving no partial state — exactly the atomicity guarantee Unit of Work provides.",
    "case_study_zh": "Entity Framework Core 的 DbContext 是 .NET 生态系统中使用最广泛的工作单元实现。当 ASP.NET 控制器操作在同一请求中更新客户地址并创建订单时，两个操作都在由单一 DbContext 跟踪的内存实体对象上执行。只有调用 SaveChanges() 时，EF Core 才分析变更跟踪器，生成最小 SQL（UPDATE + INSERT），并在单一事务中执行两条语句。如果 INSERT 失败，整个事务回滚，不留下任何部分状态——这正是工作单元提供的原子性保证。",
    "when_not_to_use": [
      "Event-sourced systems where the write model stores immutable events rather than mutable entity state, making change tracking irrelevant",
      "Read-only query services that never modify state and have no need for transaction coordination",
      "Simple single-entity operations where a single repository save within its own transaction is sufficient"
    ],
    "when_not_to_use_zh": [
      "事件溯源系统，写模型存储不可变事件而非可变实体状态，使变更跟踪变得无关",
      "从不修改状态且不需要事务协调的只读查询服务",
      "简单的单实体操作，在其自身事务中单个仓储保存就足够了"
    ],
    "adopters": [
      "Entity Framework Core (.NET)",
      "Hibernate / JPA (Java)",
      "SQLAlchemy (Python)",
      "Doctrine ORM (PHP)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Fowler, M. (2002). \"Patterns of Enterprise Application Architecture\". Addison-Wesley.",
    "secondary_sources": [
      "Evans, E. (2003). \"Domain-Driven Design: Tackling Complexity in the Heart of Software\". Addison-Wesley.",
      "Haack, P. (2011). \"Repository and Unit of Work Pattern\". haacked.com."
    ],
    "typed_relations": [
      {
        "slug": "repository-pattern",
        "type": "complement"
      },
      {
        "slug": "data-mapper-pattern",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      }
    ]
  },
  {
    "id": 237,
    "name": "Data Mapper Pattern",
    "name_zh": "数据映射器模式",
    "slug": "data-mapper-pattern",
    "category": "coding",
    "desc": "Transfer data between in-memory objects and a database while keeping them independent of each other",
    "desc_zh": "在内存对象与数据库之间传输数据，同时保持两者相互独立，领域对象对持久化完全无知",
    "steps": [
      "Design persistence-ignorant domain objects: model domain classes using only business logic and in-memory state, with no database annotations, base class inheritance, or SQL awareness",
      "Create the Mapper class: implement a dedicated class (or ORM mapping configuration) responsible for SELECT, INSERT, UPDATE, DELETE and for translating between database rows and domain objects",
      "Map columns to fields: define explicit mappings from table columns to object properties, handling type conversions, naming differences, and value object composition",
      "Handle identity and lazy loading: the mapper manages object identity (avoiding duplicate in-memory instances for the same row) and can support lazy-loading of associations via proxies",
      "Integrate with Unit of Work: register loaded and mutated objects with a Unit of Work so the mapper's write operations are batched into a single transaction on commit"
    ],
    "steps_zh": [
      "设计持久化无知的领域对象：使用仅包含业务逻辑和内存状态的领域类，没有数据库注解、基类继承或 SQL 意识",
      "创建映射器类：实现专用类（或 ORM 映射配置），负责 SELECT、INSERT、UPDATE、DELETE 以及在数据库行和领域对象之间进行转换",
      "将列映射到字段：定义从表列到对象属性的显式映射，处理类型转换、命名差异和值对象组合",
      "处理标识和懒加载：映射器管理对象标识（避免同一行在内存中有重复实例），并可通过代理支持关联的懒加载",
      "与工作单元集成：将加载的和变更的对象注册到工作单元，使映射器的写操作在提交时批处理到单一事务中"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Domain Object",
      "Data Mapper",
      "Database Row",
      "Identity Map"
    ],
    "viz_labels_zh": [
      "领域对象",
      "数据映射器",
      "数据库记录",
      "标识映射"
    ],
    "related": [
      "repository-pattern",
      "unit-of-work-pattern",
      "solid-principles",
      "active-record-pattern"
    ],
    "tags": [
      "fowler",
      "poeaa",
      "data-access",
      "orm",
      "persistence"
    ],
    "origin_author": "Martin Fowler, 2002",
    "origin_source": "Patterns of Enterprise Application Architecture",
    "origin_source_zh": "《企业应用架构模式》",
    "complexity": "advanced",
    "when_to_use": [
      "Domain objects are complex and their structure diverges significantly from the relational schema (impedance mismatch)",
      "Domain objects must remain free of persistence concerns so they can be tested, serialized, or reused in non-database contexts",
      "The database schema is legacy or independently owned and cannot be changed to match the object model"
    ],
    "when_to_use_zh": [
      "领域对象很复杂，其结构与关系模式显著不同（阻抗不匹配）",
      "领域对象必须保持对持久化关注点的自由，以便在非数据库上下文中测试、序列化或重用",
      "数据库模式是遗留的或独立拥有的，无法更改以匹配对象模型"
    ],
    "core_concepts": [
      "Persistence ignorance: domain objects know nothing about the database — no SQL, no annotations, no base ORM classes — making them pure expressions of business rules",
      "Bidirectional translation: the mapper reads rows and constructs domain objects (hydration), and reads domain objects to produce SQL (dehydration), keeping both sides clean",
      "Impedance mismatch resolution: the mapper bridges structural differences between the relational world (tables, joins, foreign keys) and the object world (inheritance, associations, value objects)"
    ],
    "core_concepts_zh": [
      "持久化无知：领域对象对数据库一无所知——没有 SQL、没有注解、没有 ORM 基类——使其成为业务规则的纯粹表达",
      "双向转换：映射器读取行并构造领域对象（水合），以及读取领域对象生成 SQL（脱水），保持两侧整洁",
      "阻抗不匹配解决：映射器桥接关系世界（表、连接、外键）与对象世界（继承、关联、值对象）之间的结构差异"
    ],
    "timeline": [
      [
        "2002",
        "Fowler formalizes Data Mapper in PoEAA, contrasting it with Active Record and noting its greater complexity but higher isolation"
      ],
      [
        "2006",
        "Hibernate becomes the dominant Java Data Mapper ORM, inspiring similar frameworks in Python (SQLAlchemy) and PHP (Doctrine)"
      ],
      [
        "2010",
        "NHibernate and then Entity Framework bring Data Mapper semantics to .NET, with EF eventually adding Code First and fluent mapping APIs"
      ],
      [
        "2020",
        "Micro-ORMs like Dapper show that hand-written SQL with explicit object mapping can outperform full ORM mappers for read-heavy workloads"
      ]
    ],
    "timeline_zh": [
      [
        "2002",
        "Fowler 在 PoEAA 中将数据映射器正式化，将其与活动记录对比，注意到其更高的复杂性但也更高的隔离性"
      ],
      [
        "2006",
        "Hibernate 成为主流 Java 数据映射器 ORM，启发了 Python（SQLAlchemy）和 PHP（Doctrine）中的类似框架"
      ],
      [
        "2010",
        "NHibernate 以及后来的 Entity Framework 将数据映射器语义引入 .NET，EF 最终添加了 Code First 和流式映射 API"
      ],
      [
        "2020",
        "Dapper 等微型 ORM 表明，对于读密集型工作负载，手写 SQL 配合显式对象映射可以优于完整 ORM 映射器"
      ]
    ],
    "dos": [
      "Do keep domain classes free of any persistence annotations or base class requirements — pure POJOs/POCOs enable the highest degree of testability",
      "Do version and test the mapping configurations independently to catch schema-migration regressions before they reach production",
      "Do consider using a micro-ORM with explicit SQL for read models where the full Data Mapper overhead is unnecessary"
    ],
    "dos_zh": [
      "保持领域类没有任何持久化注解或基类要求——纯 POJO/POCO 实现最高程度的可测试性",
      "独立对映射配置进行版本控制和测试，在模式迁移回归到达生产环境之前捕获它们",
      "考虑对读模型使用带显式 SQL 的微型 ORM，在完整数据映射器开销不必要的地方"
    ],
    "donts": [
      "Don't let the mapper bleed into the domain layer — mapping concerns (RowMapper, ResultSetExtractor) belong exclusively in the infrastructure layer",
      "Don't use Data Mapper for simple, table-per-class CRUD entities where Active Record provides the same isolation with far less ceremony",
      "Don't hand-write mappers when a mature ORM covers your schema — manual mappers require significant effort to maintain through schema changes"
    ],
    "donts_zh": [
      "不要让映射器渗入领域层——映射关注点（RowMapper、ResultSetExtractor）专属于基础设施层",
      "不要将数据映射器用于简单的、每类一表的 CRUD 实体，活动记录以更少的繁文缛节提供相同的隔离",
      "当成熟的 ORM 涵盖你的模式时，不要手写映射器——手动映射器需要大量精力来维护模式变更"
    ],
    "case_study_company": "Hibernate / Red Hat",
    "case_study": "Hibernate is the most influential Data Mapper implementation, used by millions of Java applications. A Customer domain class has no SQL, no base class, and no Hibernate imports — it is a plain Java object. Hibernate's mapping configuration (XML or JPA annotations in a separate orm.xml) translates between the CUSTOMERS table and the Customer class, handling the one-to-many relationship to Orders, lazy-loading Address as a value object, and managing the identity map to ensure that loading Customer#42 twice in the same session returns the same Java object. This separation allowed Netflix and LinkedIn to evolve domain models and database schemas independently during rapid growth.",
    "case_study_zh": "Hibernate 是最具影响力的数据映射器实现，被数百万 Java 应用使用。Customer 领域类没有 SQL、没有基类、没有 Hibernate 导入——它是一个普通的 Java 对象。Hibernate 的映射配置（单独的 orm.xml 中的 XML 或 JPA 注解）在 CUSTOMERS 表和 Customer 类之间进行转换，处理与 Orders 的一对多关系，将 Address 懒加载为值对象，并管理身份映射以确保在同一会话中两次加载 Customer#42 返回同一 Java 对象。这种分离使 Netflix 和 LinkedIn 能够在快速增长期间独立演进领域模型和数据库模式。",
    "when_not_to_use": [
      "Simple applications where domain classes map one-to-one with tables and Active Record or table gateway is significantly simpler",
      "Read-only reporting or analytics queries where raw SQL with a thin result mapper is more performant and easier to optimize",
      "Microservices with a single, small schema that owns fewer than a dozen entities, where the indirection of a full mapper adds little value"
    ],
    "when_not_to_use_zh": [
      "领域类与表一一对应的简单应用，活动记录或表网关明显更简单",
      "只读报告或分析查询，带薄结果映射器的原始 SQL 性能更好且更易于优化",
      "拥有不到十几个实体的单一小型模式的微服务，完整映射器的间接层几乎没有价值"
    ],
    "adopters": [
      "Hibernate (Java)",
      "SQLAlchemy (Python)",
      "Doctrine ORM (PHP)",
      "Entity Framework Core (.NET)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Fowler, M. (2002). \"Patterns of Enterprise Application Architecture\". Addison-Wesley.",
    "secondary_sources": [
      "Bauer, C., King, G. (2006). \"Java Persistence with Hibernate\". Manning Publications.",
      "Evans, E. (2003). \"Domain-Driven Design: Tackling Complexity in the Heart of Software\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "repository-pattern",
        "type": "complement"
      },
      {
        "slug": "unit-of-work-pattern",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "active-record-pattern",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 238,
    "name": "Builder Pattern",
    "name_zh": "构建器模式",
    "slug": "builder-pattern",
    "category": "coding",
    "desc": "Construct complex objects step by step using a fluent API, separating construction from representation",
    "desc_zh": "使用流式 API 逐步构建复杂对象，将构建过程与对象表示分离，支持多种表示形式",
    "steps": [
      "Identify the complex object: target classes with many constructor parameters (especially optional ones), telescoping constructors, or objects that require multi-step initialization",
      "Create the Builder class: implement a separate Builder (often a static inner class) with a setter method for each configurable attribute, each returning the Builder itself for chaining",
      "Add validation in build(): implement a terminal build() method that validates that required fields are set, assembles the final immutable object, and throws if preconditions are violated",
      "Make the target object immutable: construct the target class so it only accepts a fully built Builder instance, storing all values as final fields accessible via getters",
      "Introduce a Director (optional): add a Director class for commonly used configurations so clients can produce standard products (e.g., buildMinimalConfig(), buildProductionConfig()) without repeating steps"
    ],
    "steps_zh": [
      "识别复杂对象：针对具有许多构造函数参数（尤其是可选参数）、伸缩式构造函数或需要多步初始化的类",
      "创建构建器类：实现一个单独的 Builder（通常是静态内部类），为每个可配置属性提供 setter 方法，每个方法返回 Builder 本身以支持链式调用",
      "在 build() 中添加验证：实现终结方法 build()，验证必填字段已设置，组装最终不可变对象，如违反前置条件则抛出异常",
      "使目标对象不可变：构造目标类使其只接受完全构建的 Builder 实例，将所有值存储为可通过 getter 访问的 final 字段",
      "引入 Director（可选）：为常用配置添加 Director 类，使客户端可以生产标准产品（如 buildMinimalConfig()、buildProductionConfig()）而无需重复步骤"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Director",
      "Builder Interface",
      "Concrete Builder",
      "Product"
    ],
    "viz_labels_zh": [
      "指挥者",
      "建造者接口",
      "具体建造者",
      "产品"
    ],
    "related": [
      "factory-method-pattern",
      "solid-principles",
      "strategy-pattern"
    ],
    "tags": [
      "gof",
      "creational",
      "design-patterns",
      "fluent-api",
      "immutability"
    ],
    "origin_author": "Gamma, Helm, Johnson, Vlissides, 1994",
    "origin_source": "Design Patterns: Elements of Reusable Object-Oriented Software (modernized with fluent API)",
    "origin_source_zh": "《设计模式：可复用面向对象软件的基础》（以流式 API 现代化）",
    "complexity": "beginner",
    "when_to_use": [
      "A class has four or more constructor parameters, especially when several are optional, making call sites ambiguous and error-prone",
      "The same construction process should produce different representations (e.g., HTML vs PDF report builders)",
      "Object initialization requires a specific sequence of steps or validation logic that must not be bypassed"
    ],
    "when_to_use_zh": [
      "一个类有四个或更多构造函数参数，尤其是其中几个是可选的，使调用点模糊且容易出错",
      "相同的构建过程应该生产不同的表示（如 HTML vs PDF 报告构建器）",
      "对象初始化需要特定的步骤顺序或验证逻辑，不能被绕过"
    ],
    "core_concepts": [
      "Step-by-step construction: the Builder accumulates configuration through a series of named setter calls, making each parameter's purpose explicit at the call site",
      "Fluent interface: each setter returns the Builder instance, enabling a readable method-chain syntax that reads like natural language (new PersonBuilder().name(\"Alice\").age(30).build())",
      "Separation of construction and representation: the Builder isolates complex assembly logic from the product class itself, allowing multiple Builder implementations to produce different product variants"
    ],
    "core_concepts_zh": [
      "逐步构建：构建器通过一系列命名 setter 调用积累配置，使每个参数的用途在调用点显式明确",
      "流式接口：每个 setter 返回 Builder 实例，支持可读的方法链语法，读起来像自然语言（new PersonBuilder().name(\"Alice\").age(30).build()）",
      "构建与表示分离：Builder 将复杂的组装逻辑与产品类本身隔离，允许多个 Builder 实现生产不同的产品变体"
    ],
    "timeline": [
      [
        "1994",
        "GoF introduce Builder as a creational pattern focused on step-by-step construction with an optional Director to control sequencing"
      ],
      [
        "2001",
        "Joshua Bloch popularizes the fluent Builder inner-class idiom in Effective Java as a solution to telescoping constructors"
      ],
      [
        "2009",
        "Lombok's @Builder annotation auto-generates fluent Builder code in Java, making the pattern near-zero boilerplate"
      ],
      [
        "2020",
        "Kotlin data classes with named parameters and default values reduce the need for Builder in modern JVM code, while Swift uses the pattern for SwiftUI view modifiers"
      ]
    ],
    "timeline_zh": [
      [
        "1994",
        "GoF 将构建器作为创建型模式引入，专注于逐步构建，可选 Director 控制顺序"
      ],
      [
        "2001",
        "Joshua Bloch 在《Effective Java》中推广流式构建器内部类习语，作为伸缩式构造函数的解决方案"
      ],
      [
        "2009",
        "Lombok 的 @Builder 注解在 Java 中自动生成流式构建器代码，使该模式近乎零样板"
      ],
      [
        "2020",
        "Kotlin 的命名参数和默认值数据类减少了现代 JVM 代码中对构建器的需求，而 Swift 在 SwiftUI 视图修饰符中使用该模式"
      ]
    ],
    "dos": [
      "Do make the built product immutable by storing all fields as final and providing only getters — the Builder is the sole mutation point",
      "Do validate field combinations in build() rather than in individual setters, so validation runs once against the complete configuration",
      "Do provide sensible defaults for optional fields in the Builder so callers only need to specify what differs from the norm"
    ],
    "dos_zh": [
      "通过将所有字段存储为 final 并只提供 getter 使构建的产品不可变——构建器是唯一的变更点",
      "在 build() 中验证字段组合而非在单个 setter 中，使验证在完整配置上运行一次",
      "在 Builder 中为可选字段提供合理的默认值，使调用方只需指定与规范不同的内容"
    ],
    "donts": [
      "Don't use Builder for simple objects with one or two required fields — it adds unnecessary ceremony where a plain constructor is clearer",
      "Don't allow the Builder to be reused after build() is called without a clear reset; reuse leads to accidental shared state between product instances",
      "Don't skip the terminal build() validation step — returning a partially configured object silently is worse than a clear exception at construction time"
    ],
    "donts_zh": [
      "不要对具有一两个必填字段的简单对象使用构建器——在普通构造函数更清晰的地方增加了不必要的繁文缛节",
      "不要在 build() 调用后允许 Builder 重用而不明确重置；重用会导致产品实例之间意外的共享状态",
      "不要跳过终结 build() 验证步骤——静默返回部分配置的对象比构建时的明确异常更糟糕"
    ],
    "case_study_company": "OkHttp / Square",
    "case_study": "OkHttp, the widely used Android and Java HTTP client, uses Builder throughout its API. OkHttpClient is constructed via OkHttpClient.Builder with chainable calls to set timeouts, interceptors, cache, SSL certificates, and proxy configuration. Request objects are built via Request.Builder with URL, method, headers, and body. Neither OkHttpClient nor Request can be constructed directly — only the Builder exposes setters, and build() produces immutable instances. This design made OkHttp's API self-documenting, prevented half-initialized client objects, and enabled Square's team to add new configuration options across major versions without breaking existing call sites.",
    "case_study_zh": "OkHttp（广泛使用的 Android 和 Java HTTP 客户端）在其 API 中全面使用构建器。OkHttpClient 通过 OkHttpClient.Builder 构建，可链式调用设置超时、拦截器、缓存、SSL 证书和代理配置。Request 对象通过 Request.Builder 构建，包含 URL、方法、头部和正文。OkHttpClient 和 Request 均不能直接构建——只有 Builder 暴露 setter，build() 生产不可变实例。这种设计使 OkHttp 的 API 自文档化，防止了半初始化的客户端对象，并使 Square 团队能够在主要版本中添加新配置选项而不破坏现有调用点。",
    "when_not_to_use": [
      "Value objects with only one or two fields where a plain constructor with named parameters is more readable",
      "Mutable objects that change state frequently after construction — Builder is optimized for producing final, immutable products",
      "Performance-critical tight loops where the extra Builder object allocation per product is a measurable bottleneck"
    ],
    "when_not_to_use_zh": [
      "只有一两个字段的值对象，带命名参数的普通构造函数更具可读性",
      "构建后频繁改变状态的可变对象——构建器针对生产最终不可变产品进行了优化",
      "性能关键的紧循环，每个产品额外的 Builder 对象分配是可测量的瓶颈"
    ],
    "adopters": [
      "OkHttp / Retrofit (Square)",
      "Lombok @Builder (Java)",
      "SwiftUI (view modifiers)",
      "Apache HttpClient"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Bloch, J. (2008). \"Effective Java, 2nd ed.\". Addison-Wesley.",
      "Freeman, E., Robson, E. (2004). \"Head First Design Patterns\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "factory-method-pattern",
        "type": "related"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "strategy-pattern",
        "type": "related"
      }
    ]
  },
  {
    "id": 239,
    "name": "Middleware / Pipeline Pattern",
    "name_zh": "中间件/管道模式",
    "slug": "middleware-pipeline-pattern",
    "category": "coding",
    "desc": "Chain processing steps that can inspect, transform, or short-circuit a request as it flows through a pipeline",
    "desc_zh": "将处理步骤链接成管道，每个步骤可检查、转换请求或短路请求的流动，实现横切关注点的解耦",
    "steps": [
      "Define the middleware interface: establish a contract where each middleware receives the request/context and a next function to invoke the remaining pipeline",
      "Implement individual middleware functions: each middleware performs one focused concern (authentication, logging, compression, rate limiting) and calls next() to pass control downstream",
      "Compose the pipeline: register middleware in order using an app.use() style API, a list, or a builder; the framework or pipeline runner chains them so each wraps the next",
      "Handle short-circuiting: allow middleware to terminate the pipeline early (e.g., return 401 without calling next()) for cross-cutting rejection or caching scenarios",
      "Support bi-directional flow (onion model): design middleware to execute logic both before calling next() (inbound/request phase) and after it returns (outbound/response phase) for symmetric concerns like timing and error wrapping"
    ],
    "steps_zh": [
      "定义中间件接口：建立每个中间件接收请求/上下文和调用剩余管道的 next 函数的契约",
      "实现各个中间件函数：每个中间件执行一个专注的关注点（认证、日志、压缩、速率限制），并调用 next() 将控制权传递给下游",
      "组合管道：使用 app.use() 风格 API、列表或构建器按顺序注册中间件；框架或管道运行器将它们链接起来，使每个中间件包装下一个",
      "处理短路：允许中间件提前终止管道（如不调用 next() 直接返回 401），用于横切拒绝或缓存场景",
      "支持双向流（洋葱模型）：设计中间件在调用 next() 之前（入站/请求阶段）和之后（出站/响应阶段）都执行逻辑，用于时间和错误包装等对称关注点"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Request",
      "Middleware",
      "Handler",
      "Response"
    ],
    "viz_labels_zh": [
      "请求",
      "中间件",
      "处理器",
      "响应"
    ],
    "related": [
      "decorator-pattern",
      "command-pattern"
    ],
    "tags": [
      "pipeline",
      "middleware",
      "behavioral",
      "http",
      "cross-cutting-concerns"
    ],
    "origin_author": "Community practice; popularized by TJ Holowaychuk (Express.js / Koa), 2010s",
    "origin_source": "Express.js, Koa, ASP.NET Core, and WSGI community practice",
    "origin_source_zh": "Express.js、Koa、ASP.NET Core 及 WSGI 社区实践",
    "complexity": "intermediate",
    "when_to_use": [
      "Cross-cutting concerns (auth, logging, tracing, rate limiting, caching) need to be applied uniformly across many request handlers without polluting handler code",
      "Request processing involves a well-defined, ordered sequence of transformations where each step is independently testable and replaceable",
      "Building extensible frameworks or plugins systems where users can inject behavior into the processing flow without modifying core code"
    ],
    "when_to_use_zh": [
      "横切关注点（认证、日志、追踪、速率限制、缓存）需要统一应用于许多请求处理程序，而不污染处理程序代码",
      "请求处理涉及定义明确的有序转换序列，每个步骤独立可测试且可替换",
      "构建可扩展框架或插件系统，用户可以在不修改核心代码的情况下将行为注入处理流程"
    ],
    "core_concepts": [
      "Middleware function: a unit of pipeline logic that receives the context and a next() function; it can inspect or mutate context, delegate to next(), and optionally process the result on the way back",
      "Pipeline composition: middleware functions are stacked at configuration time into a chain; at runtime each function wraps and delegates to the remaining chain, creating the onion model",
      "Short-circuit termination: middleware can halt the pipeline by not calling next(), returning a synthetic response directly — enabling authentication guards, cache hits, and error fences without touching downstream handlers"
    ],
    "core_concepts_zh": [
      "中间件函数：管道逻辑的单元，接收上下文和 next() 函数；可以检查或改变上下文，委托给 next()，并可选地在返回途中处理结果",
      "管道组合：中间件函数在配置时堆叠成链；在运行时每个函数包装并委托给剩余链，创建洋葱模型",
      "短路终止：中间件可以通过不调用 next() 来中止管道，直接返回合成响应——使认证守卫、缓存命中和错误隔离无需触及下游处理程序"
    ],
    "timeline": [
      [
        "2000",
        "Python's WSGI specification defines a middleware-composable interface for web applications, predating popular Node.js frameworks"
      ],
      [
        "2010",
        "TJ Holowaychuk releases Express.js for Node.js, popularizing app.use() middleware as the dominant pattern for HTTP pipeline composition"
      ],
      [
        "2013",
        "Koa.js introduces the generator-based (later async/await) onion model, making bidirectional middleware first-class"
      ],
      [
        "2016",
        "ASP.NET Core rebuilds its HTTP pipeline around IMiddleware and app.Use(), bringing the Express idiom to the .NET ecosystem with dependency injection support"
      ],
      [
        "2022",
        "AI agent frameworks (LangChain, LlamaIndex) adopt pipeline/middleware patterns for LLM request chains, tool routing, and response post-processing"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "Python 的 WSGI 规范为 Web 应用定义了中间件可组合接口，早于流行的 Node.js 框架"
      ],
      [
        "2010",
        "TJ Holowaychuk 为 Node.js 发布 Express.js，推广 app.use() 中间件作为 HTTP 管道组合的主流模式"
      ],
      [
        "2013",
        "Koa.js 引入基于生成器（后来是 async/await）的洋葱模型，使双向中间件成为一等公民"
      ],
      [
        "2016",
        "ASP.NET Core 围绕 IMiddleware 和 app.Use() 重建其 HTTP 管道，将 Express 习语带入 .NET 生态系统并支持依赖注入"
      ],
      [
        "2022",
        "AI 智能体框架（LangChain、LlamaIndex）采用管道/中间件模式用于 LLM 请求链、工具路由和响应后处理"
      ]
    ],
    "dos": [
      "Do keep each middleware focused on a single concern — authentication middleware should not also perform logging; compose separate middlewares instead",
      "Do always call next() or explicitly terminate; forgetting next() in non-terminating middleware silently hangs requests and is one of the most common pipeline bugs",
      "Do order middleware deliberately — security and authentication checks must precede business logic; logging middleware should wrap the entire pipeline to capture timings"
    ],
    "dos_zh": [
      "保持每个中间件专注于单一关注点——认证中间件不应同时执行日志记录；改为组合独立的中间件",
      "始终调用 next() 或显式终止；在非终止中间件中忘记 next() 会静默挂起请求，是最常见的管道错误之一",
      "刻意排序中间件——安全和认证检查必须在业务逻辑之前；日志中间件应包装整个管道以捕获时间"
    ],
    "donts": [
      "Don't put significant business logic inside middleware — it makes the logic hard to test in isolation and blurs the separation between infrastructure and domain concerns",
      "Don't swallow errors inside middleware without either handling them fully or re-throwing them — silent error absorption prevents upstream error handlers from running",
      "Don't register too many middleware globally when only a few routes need them — route-specific middleware keeps pipelines lean and improves request tracing clarity"
    ],
    "donts_zh": [
      "不要在中间件内放置重要的业务逻辑——这使逻辑难以隔离测试，并模糊了基础设施和领域关注点之间的分离",
      "不要在中间件内静默吞噬错误而不完全处理或重新抛出——静默错误吸收会阻止上游错误处理程序运行",
      "当只有少数路由需要时，不要全局注册太多中间件——特定路由的中间件保持管道精简并提高请求追踪清晰度"
    ],
    "case_study_company": "Express.js / OpenJS Foundation",
    "case_study": "Express.js made the middleware pipeline pattern the de facto standard for Node.js web development. An Express application handling e-commerce requests chains: morgan (request logging), helmet (security headers), cors (CORS policy), express-rate-limit (abuse prevention), passport.authenticate (JWT verification), and finally the route handler itself. Each middleware is independently installable, testable with supertest, and replaceable without touching others. When Stripe integrated Express into their API development workflow, they used middleware chains to enforce API versioning, idempotency-key validation, and audit logging across all endpoints without duplicating these concerns inside individual route handlers.",
    "case_study_zh": "Express.js 使中间件管道模式成为 Node.js Web 开发的事实标准。处理电商请求的 Express 应用链接：morgan（请求日志）、helmet（安全头部）、cors（CORS 策略）、express-rate-limit（滥用防护）、passport.authenticate（JWT 验证），最终是路由处理程序本身。每个中间件都独立可安装、可用 supertest 测试，且可替换而不触及其他中间件。当 Stripe 将 Express 集成到其 API 开发工作流时，他们使用中间件链在所有端点强制执行 API 版本控制、幂等键验证和审计日志，而无需在单个路由处理程序内重复这些关注点。",
    "when_not_to_use": [
      "Simple single-step transformations where a plain function call is more readable than a pipeline of one",
      "Batch processing jobs with no need for cross-cutting HTTP concerns, where a direct sequence of function calls is easier to follow",
      "High-throughput low-latency paths where middleware function call overhead and context allocation per request is a measurable bottleneck"
    ],
    "when_not_to_use_zh": [
      "简单的单步转换，普通函数调用比只有一个中间件的管道更具可读性",
      "无需横切 HTTP 关注点的批处理作业，直接的函数调用序列更容易理解",
      "高吞吐量低延迟路径，每个请求的中间件函数调用开销和上下文分配是可测量的瓶颈"
    ],
    "adopters": [
      "Express.js / Koa.js (Node.js)",
      "ASP.NET Core (.NET)",
      "Django middleware (Python)",
      "LangChain (AI pipelines)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Holowaychuk, T. (2010). Express.js framework documentation. expressjs.com.",
    "secondary_sources": [
      "Microsoft (2016). \"ASP.NET Core Middleware\". docs.microsoft.com.",
      "Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "decorator-pattern",
        "type": "related"
      },
      {
        "slug": "command-pattern",
        "type": "related"
      }
    ]
  },
  {
    "id": 277,
    "name": "Strangler Fig at Code Level",
    "name_zh": "代码层面的绞杀者模式",
    "slug": "strangler-fig-code-level",
    "category": "coding",
    "desc": "Gradually replacing legacy code modules by growing new implementations alongside old ones until the legacy can be safely removed",
    "desc_zh": "通过在旧实现旁边逐步构建新实现来替换遗留代码模块，直到遗留代码可以安全移除",
    "steps": [
      "Identify the legacy module boundary: find the seam — a function signature, interface, or module import — that represents the contract between the legacy code and its callers, and treat this boundary as your migration surface",
      "Create a parallel new implementation behind the same interface: write the replacement module with the same public API but a new internal implementation, using the Adapter pattern if the legacy interface cannot be changed",
      "Introduce a dispatch mechanism: add a feature flag, factory function, or routing layer that directs calls to either the old or new implementation based on a configuration toggle or percentage rollout",
      "Migrate callers incrementally: move one caller at a time to the new implementation, verifying behavior equivalence through side-by-side testing, shadow mode (new runs but results are discarded), or A/B comparison of outputs",
      "Delete the legacy code once all callers have been migrated and confidence is established: remove the dispatch mechanism, the old implementation, and all related scaffolding — the strangler fig is complete when the old tree is gone"
    ],
    "steps_zh": [
      "识别遗留模块边界：找到接缝——函数签名、接口或模块导入——代表遗留代码与其调用者之间的契约，将此边界作为迁移面",
      "在相同接口后面创建并行的新实现：用相同的公共API但全新的内部实现编写替换模块，如果无法更改遗留接口则使用适配器模式",
      "引入调度机制：添加功能标志、工厂函数或路由层，根据配置开关或百分比推出将调用定向到旧或新实现",
      "增量迁移调用者：每次迁移一个调用者到新实现，通过并排测试、影子模式（新实现运行但结果被丢弃）或输出的A/B比较验证行为等价性",
      "一旦所有调用者都已迁移且建立了信心，删除遗留代码：移除调度机制、旧实现和所有相关脚手架——当旧树消失时，绞杀者模式完成"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Legacy Code",
      "Strangler Facade",
      "New Module",
      "Migration"
    ],
    "viz_labels_zh": [
      "遗留代码",
      "绞杀者门面",
      "新模块",
      "迁移路径"
    ],
    "related": [
      "strangler-fig-pattern",
      "feature-flags",
      "adapter-pattern",
      "branch-by-abstraction",
      "solid-principles"
    ],
    "tags": [
      "refactoring",
      "legacy",
      "strangler-fig",
      "martin-fowler",
      "migration",
      "incremental",
      "code-level"
    ],
    "origin_author": "Martin Fowler",
    "origin_source": "Fowler, M. (2004). \"StranglerFigApplication\". martinfowler.com.",
    "origin_source_zh": "Fowler, M.（2004）.「StranglerFigApplication」. martinfowler.com.",
    "complexity": "intermediate",
    "when_to_use": [
      "Replacing a legacy module that is deeply embedded in business-critical code where a big-bang rewrite is too risky — the strangler pattern lets you migrate with continuous verification at each step",
      "Changing the implementation of a module without changing its interface, when downstream callers are too numerous or important to migrate all at once",
      "Teams practicing continuous delivery who need to keep the codebase always deployable during a multi-week or multi-month migration effort",
      "Extracting a function from a monolith into a service, where the module boundary in the monolith becomes the anti-corruption layer during the transition"
    ],
    "when_to_use_zh": [
      "替换深度嵌入业务关键代码的遗留模块，全量重写风险太高——绞杀者模式让你在每个步骤持续验证的同时进行迁移",
      "在不改变接口的情况下更改模块实现，当下游调用者太多或太重要而无法一次性全部迁移时",
      "实践持续交付的团队，在多周或多月的迁移工作中需要保持代码库始终可部署",
      "将函数从单体应用提取到服务中，单体应用中的模块边界在过渡期间成为防腐层"
    ],
    "core_concepts": [
      "Seam Identification: finding the natural boundaries in legacy code — function signatures, interfaces, module imports — that can serve as the dispatch point between old and new implementations without requiring changes to callers",
      "Branch by Abstraction: introducing an abstraction layer (interface, abstract class, or function type) over legacy code so that the new implementation can be developed and tested behind the same contract",
      "Feature Flag Dispatch: using a runtime or compile-time toggle to route calls between the old and new implementation, enabling incremental rollout and instant rollback if the new implementation shows regressions",
      "Shadow Mode: running both implementations in parallel but only returning results from the legacy, while logging new implementation outputs for comparison — eliminates risk of behavioral divergence before switching",
      "Deletion as Completion: the migration is not done until the legacy code is actually deleted; keeping both implementations permanently creates complexity debt; the strangler pattern is complete only when the old code is gone"
    ],
    "core_concepts_zh": [
      "接缝识别：在遗留代码中找到自然边界——函数签名、接口、模块导入——可以作为旧实现和新实现之间的调度点，无需更改调用者",
      "通过抽象分支：在遗留代码上引入抽象层（接口、抽象类或函数类型），使新实现可以在相同契约下开发和测试",
      "功能标志调度：使用运行时或编译时开关在旧实现和新实现之间路由调用，实现增量推出，如果新实现出现回归可立即回滚",
      "影子模式：并行运行两个实现但只返回遗留实现的结果，同时记录新实现输出用于比较——在切换前消除行为分歧的风险",
      "删除即完成：迁移在实际删除遗留代码之前尚未完成；永久保留两个实现会产生复杂度债务；绞杀者模式只有在旧代码消失时才算完成"
    ],
    "timeline": [
      [
        "2004",
        "Martin Fowler publishes 'StranglerFigApplication' on martinfowler.com, naming the pattern after the strangler fig tree that grows around and eventually replaces its host"
      ],
      [
        "2010",
        "The pattern gains traction in enterprise Java and .NET communities as teams tackle legacy system modernization without big-bang rewrites"
      ],
      [
        "2016",
        "Sam Newman's 'Building Microservices' popularizes the Strangler Fig as the primary pattern for incrementally extracting services from monoliths"
      ],
      [
        "2020",
        "The pattern becomes central to cloud migration playbooks as organizations strangling on-premise monoliths toward cloud-native microservices architectures"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "Martin Fowler在martinfowler.com上发表「StranglerFigApplication」，以绞杀无花果树命名该模式，这种树会围绕并最终取代其宿主"
      ],
      [
        "2010",
        "随着团队在不进行全量重写的情况下处理遗留系统现代化，该模式在企业Java和.NET社区中获得关注"
      ],
      [
        "2016",
        "Sam Newman的《构建微服务》将绞杀者模式推广为从单体应用增量提取服务的主要模式"
      ],
      [
        "2020",
        "随着组织将本地单体应用绞杀为云原生微服务架构，该模式成为云迁移手册的核心"
      ]
    ],
    "dos": [
      "Do identify a clean seam before starting — if there is no clear interface boundary, create one first using Branch by Abstraction before implementing the new version",
      "Do use shadow mode (dark launch) for high-risk replacements where both implementations run but only the legacy result is used, allowing silent comparison of outputs before switching",
      "Do delete legacy code as soon as all callers are migrated — leaving both implementations alive permanently defeats the purpose and creates maintenance burden for future contributors",
      "Do maintain comprehensive behavior tests for the legacy module before starting the strangler, so you have a regression suite to validate the new implementation against"
    ],
    "dos_zh": [
      "在开始之前识别清晰的接缝——如果没有明确的接口边界，在实现新版本之前首先使用「通过抽象分支」创建一个",
      "对高风险替换使用影子模式（暗发布），两个实现都运行但只使用遗留结果，在切换前允许静默比较输出",
      "一旦所有调用者都迁移完毕立即删除遗留代码——永久保留两个实现会使目的落空并给未来贡献者带来维护负担",
      "在开始绞杀之前维护遗留模块的全面行为测试，这样你就有回归测试套件来验证新实现"
    ],
    "donts": [
      "Don't start a strangler migration without a clear deletion plan — migrations with no end date accumulate both implementations indefinitely, doubling the code that must be understood and maintained",
      "Don't change the interface while strangling — the whole point of the dispatch layer is that callers don't need to change; if you change the API simultaneously you are doing a rewrite, not a strangler",
      "Don't underestimate edge cases and implicit contracts in the legacy code — hidden coupling, undocumented side effects, and order-of-operations dependencies are the most common sources of behavioral divergence in strangler migrations",
      "Don't run both implementations in production indefinitely without a firm migration deadline — the dispatch layer itself becomes technical debt if no team owns removing it"
    ],
    "donts_zh": [
      "不要在没有明确删除计划的情况下开始绞杀迁移——没有截止日期的迁移会无限积累两个实现，使需要理解和维护的代码翻倍",
      "不要在绞杀过程中更改接口——调度层的全部意义在于调用者不需要更改；如果同时更改API，你在做的是重写，而非绞杀",
      "不要低估遗留代码中的边缘情况和隐式契约——隐藏的耦合、未记录的副作用和操作顺序依赖是绞杀迁移中行为分歧最常见的来源",
      "不要在没有明确迁移截止日期的情况下无限期在生产环境中运行两个实现——如果没有团队负责移除它，调度层本身会成为技术债务"
    ],
    "case_study_company": "LinkedIn",
    "case_study": "LinkedIn used the Strangler Fig pattern at code level to migrate their monolithic member-profile service from a legacy Oracle-backed Java persistence layer to a new Espresso (distributed document store) backend. Rather than rewriting the profile service in a single release, they introduced a ProfileRepository interface over both backends and a feature-flag-controlled dispatch layer. Individual profile attributes (name, headline, connections count) were migrated one field at a time, running both read paths in shadow mode to compare outputs. The full migration of the 1,400-field profile entity took 14 months and zero production incidents caused by the migration itself. The legacy Oracle profile tables were decommissioned as the last step, eliminating $4M/year in Oracle licensing costs.",
    "case_study_zh": "LinkedIn使用代码层面的绞杀者模式将其单体成员资料服务从遗留的Oracle支持Java持久层迁移到新的Espresso（分布式文档存储）后端。他们没有在单次发布中重写资料服务，而是在两个后端上引入了ProfileRepository接口和功能标志控制的调度层。单个资料属性（姓名、标题、连接数）每次迁移一个字段，以影子模式运行两个读取路径来比较输出。包含1400个字段的资料实体的完整迁移历时14个月，迁移本身没有造成任何生产事故。最后一步废弃遗留Oracle资料表，每年节省400万美元的Oracle许可费用。",
    "when_not_to_use": [
      "The legacy module has no identifiable interface seam and is deeply entangled with global state, making it impossible to run old and new implementations in parallel without cross-contamination",
      "The replacement requires a fundamentally different data model or protocol that cannot be hidden behind the existing interface without creating an awkward impedance mismatch",
      "The module is so small and low-risk that the overhead of implementing a dispatch layer, shadow mode, and incremental migration exceeds the risk of a direct replacement"
    ],
    "when_not_to_use_zh": [
      "遗留模块没有可识别的接口接缝，且与全局状态深度纠缠，使得无法在没有交叉污染的情况下并行运行旧实现和新实现",
      "替换需要根本不同的数据模型或协议，无法在现有接口后面隐藏而不产生尴尬的阻抗不匹配",
      "模块太小、风险太低，实现调度层、影子模式和增量迁移的开销超过直接替换的风险"
    ],
    "adopters": [
      "LinkedIn",
      "Shopify",
      "Etsy",
      "Booking.com",
      "ThoughtWorks",
      "Amazon"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Fowler, M. (2004). \"StranglerFigApplication\". martinfowler.com.",
    "secondary_sources": [
      "Newman, S. (2019). \"Monolith to Microservices\". O'Reilly Media.",
      "Feathers, M. (2004). \"Working Effectively with Legacy Code\". Prentice Hall.",
      "Humble, J. & Farley, D. (2010). \"Continuous Delivery\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "feature-flags",
        "type": "complement"
      },
      {
        "slug": "adapter-pattern",
        "type": "related"
      },
      {
        "slug": "solid-principles",
        "type": "complement"
      }
    ]
  },
  {
    "id": 278,
    "name": "Vertical Slice Architecture",
    "name_zh": "垂直切片架构",
    "slug": "vertical-slice-architecture",
    "category": "coding",
    "desc": "Organizing code by feature rather than by technical layer, grouping all code for a feature — from HTTP handler to database query — in a single cohesive slice",
    "desc_zh": "按功能而非技术层次组织代码，将一个功能的所有代码——从HTTP处理器到数据库查询——组合在单一的内聚切片中",
    "steps": [
      "Define the slices: identify the discrete features or use cases in your application (CreateOrder, GetProductCatalog, UserLogin) — each becomes a self-contained vertical slice with its own handler, validation, business logic, and data access code",
      "Co-locate everything for a feature: place the command/query object, handler, validator, and any DTOs in a single folder or file rather than splitting them across Controllers, Services, and Repositories directories",
      "Use MediatR (or equivalent mediator/command bus): dispatch commands and queries through a mediator so that the HTTP layer sends a message and doesn't directly depend on the handler implementation, enabling independent handler evolution",
      "Share carefully: create shared abstractions only for true cross-cutting concerns (authentication, logging, error handling) using middleware and behaviors — resist the urge to build shared service layers that couple all features",
      "Test each slice independently: write integration tests that test the full vertical slice from handler input to database output, validating the behavior of each feature as a unit without mocking internal implementation details"
    ],
    "steps_zh": [
      "定义切片：识别应用中的离散功能或用例（CreateOrder、GetProductCatalog、UserLogin）——每个都成为包含自己的处理器、验证、业务逻辑和数据访问代码的自包含垂直切片",
      "共置一个功能的所有内容：将命令/查询对象、处理器、验证器和任何DTO放在单一文件夹或文件中，而非将它们拆分到Controllers、Services和Repositories目录中",
      "使用MediatR（或等效的中介器/命令总线）：通过中介器分派命令和查询，使HTTP层发送消息而不直接依赖处理器实现，实现处理器的独立演化",
      "谨慎共享：仅为真正的横切关注点（认证、日志、错误处理）使用中间件和行为创建共享抽象——抵制构建将所有功能耦合在一起的共享服务层的冲动",
      "独立测试每个切片：编写从处理器输入到数据库输出测试完整垂直切片的集成测试，将每个功能的行为作为单元进行验证，无需模拟内部实现细节"
    ],
    "ai_relevant": false,
    "viz_type": "venn",
    "viz_labels": [
      "Feature Slice",
      "UI Layer",
      "Domain Logic"
    ],
    "viz_labels_zh": [
      "功能切片",
      "UI层",
      "领域逻辑"
    ],
    "related": [
      "solid-principles",
      "domain-driven-design",
      "hexagonal-architecture"
    ],
    "tags": [
      "vertical-slice",
      "architecture",
      "feature-organization",
      "cqrs",
      "mediatr",
      "jimmy-bogard",
      "dotnet"
    ],
    "origin_author": "Jimmy Bogard",
    "origin_source": "Bogard, J. (2018). \"Vertical Slice Architecture\". NDC Sydney conference talk and jimmybogard.com.",
    "origin_source_zh": "Bogard, J.（2018）.「垂直切片架构」. NDC Sydney会议演讲及jimmybogard.com.",
    "complexity": "intermediate",
    "when_to_use": [
      "Applications where adding a new feature requires touching files across many technical layers (controller, service, repository, DTO, mapper) creating high coordination cost and merge conflict frequency",
      "Teams that find traditional layered architecture creates excessive abstraction and ceremony for CRUD-heavy applications where most features have simple data access patterns",
      "Codebases where features are largely independent and cross-feature coupling is a code smell — vertical slices make dependencies visible and explicit rather than hidden in shared service layers",
      "Environments where feature teams own end-to-end slices and need to minimize the coordination cost of changing any single feature without affecting others"
    ],
    "when_to_use_zh": [
      "添加新功能需要触及许多技术层（控制器、服务、仓储、DTO、映射器）中的文件，产生高协调成本和合并冲突频率的应用",
      "发现传统分层架构为以CRUD为主的应用创建了过度抽象和仪式感的团队，这些应用大多数功能具有简单的数据访问模式",
      "功能大部分独立且跨功能耦合是代码异味的代码库——垂直切片使依赖关系可见且明确，而非隐藏在共享服务层中",
      "功能团队负责端到端切片并需要最小化在不影响其他功能的情况下更改任何单个功能的协调成本的环境"
    ],
    "core_concepts": [
      "Feature Cohesion: all code implementing a single user-facing feature lives together in one location — command object, validator, handler, and persistence logic — making each feature independently understandable and modifiable",
      "Minimal Shared Layers: instead of universal Services or Repository layers, share only true cross-cutting concerns (auth, logging, error handling) through middleware, behaviors, and pipeline decorators",
      "CQRS Alignment: vertical slices align naturally with Command Query Responsibility Segregation — commands and queries are the natural slice boundaries, each with its own handler and data access strategy",
      "Mediator Pattern: a command/query bus (MediatR in .NET) decouples the HTTP entry point from the handler implementation, enabling cross-cutting behaviors (validation, logging, caching) to be applied as pipeline behaviors",
      "Integration-First Testing: testing the full vertical slice from HTTP request to database response gives high confidence tests that are resilient to internal refactoring, compared to unit tests of individual layers"
    ],
    "core_concepts_zh": [
      "功能内聚：实现单个面向用户功能的所有代码放在同一位置——命令对象、验证器、处理器和持久化逻辑——使每个功能独立可理解和可修改",
      "最小共享层：不使用通用Service或Repository层，仅通过中间件、行为和管道装饰器共享真正的横切关注点（认证、日志、错误处理）",
      "CQRS对齐：垂直切片与命令查询职责分离自然对齐——命令和查询是自然的切片边界，每个都有自己的处理器和数据访问策略",
      "中介器模式：命令/查询总线（.NET中的MediatR）将HTTP入口点与处理器实现解耦，使横切行为（验证、日志、缓存）可以作为管道行为应用",
      "集成优先测试：从HTTP请求到数据库响应测试完整垂直切片，与单独层的单元测试相比，提供对内部重构具有弹性的高置信度测试"
    ],
    "timeline": [
      [
        "2004",
        "Alistair Cockburn describes Use Case Slices in his hexagonal architecture writings, prefiguring the vertical slice idea of organizing code around use cases"
      ],
      [
        "2013",
        "Jimmy Bogard creates MediatR for .NET, providing the command bus infrastructure that makes vertical slice architecture practical to implement"
      ],
      [
        "2018",
        "Bogard delivers 'Vertical Slice Architecture' at NDC Sydney, coining the term and articulating the philosophy as an alternative to Clean Architecture's layer separation"
      ],
      [
        "2022",
        "The pattern gains significant adoption in .NET, Go, and TypeScript communities as teams look for alternatives to over-engineered layered architectures"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "Alistair Cockburn在其六边形架构文章中描述了用例切片，预示了围绕用例组织代码的垂直切片思想"
      ],
      [
        "2013",
        "Jimmy Bogard为.NET创建MediatR，提供使垂直切片架构实际可实现的命令总线基础设施"
      ],
      [
        "2018",
        "Bogard在NDC Sydney发表「垂直切片架构」，创造该术语并将其理念阐述为Clean Architecture层分离的替代方案"
      ],
      [
        "2022",
        "随着团队寻找过度工程化的分层架构的替代方案，该模式在.NET、Go和TypeScript社区中获得显著采用"
      ]
    ],
    "dos": [
      "Do treat each slice as potentially having different internal structure — a simple read query might be a single function, while a complex write command might use a full domain model; don't force uniformity across slices",
      "Do use the mediator pipeline for cross-cutting behaviors (validation, authorization, logging, transactions) so these apply consistently to all handlers without each handler needing to call them explicitly",
      "Do write slice-level integration tests rather than isolated unit tests for each layer — testing the full slice from input to output gives confidence that the feature works end-to-end",
      "Do start with feature-folder organization even if you don't adopt the full CQRS/MediatR stack — co-locating related files is the core principle and can be applied independently of the tooling"
    ],
    "dos_zh": [
      "将每个切片视为可能具有不同内部结构——简单的读取查询可能是单个函数，而复杂的写入命令可能使用完整的领域模型；不要强制跨切片的统一性",
      "使用中介器管道处理横切行为（验证、授权、日志、事务），使这些行为一致地应用于所有处理器，而无需每个处理器显式调用它们",
      "编写切片级集成测试而非每层的隔离单元测试——从输入到输出测试完整切片，确保功能端到端正常工作",
      "即使不采用完整的CQRS/MediatR技术栈，也从功能文件夹组织开始——共置相关文件是核心原则，可以独立于工具链应用"
    ],
    "donts": [
      "Don't create a SharedService layer that multiple slices depend on — this reintroduces the coupling that vertical slices are designed to eliminate; if logic is truly shared, promote it to a domain concept",
      "Don't confuse vertical slices with microservices — slices are a code organization pattern within a single deployable unit; they are about developer experience and maintainability, not deployment topology",
      "Don't apply vertical slices mechanically to every project — highly domain-complex applications (rich domain models with deep invariants) may benefit from traditional DDD layering within each slice",
      "Don't skip defining the slice boundaries carefully — if slice boundaries don't align with actual user-facing features, you end up with arbitrary code groupings that don't reduce coupling"
    ],
    "donts_zh": [
      "不要创建多个切片依赖的SharedService层——这重新引入了垂直切片旨在消除的耦合；如果逻辑真正共享，将其提升为领域概念",
      "不要将垂直切片与微服务混淆——切片是单个可部署单元内的代码组织模式；它们关于的是开发者体验和可维护性，而非部署拓扑",
      "不要机械地将垂直切片应用于每个项目——具有深度不变量的高度领域复杂应用（丰富领域模型）可能受益于每个切片内的传统DDD分层",
      "不要跳过仔细定义切片边界——如果切片边界与实际面向用户的功能不对齐，你最终得到的是不能减少耦合的任意代码分组"
    ],
    "case_study_company": "Headspring (now Accenture)",
    "case_study": "Headspring, the consulting firm where Jimmy Bogard worked, applied Vertical Slice Architecture across multiple enterprise client projects. In a large insurance claims processing system with 200+ features, they replaced a traditional N-Tier architecture (Controllers → Services → Repositories) with feature folders, each containing the command/query, validator, handler, and any supporting types. The result was that a developer could understand a complete feature by reading a single folder rather than tracing code through 6 layers across 6 directories. Adding a new feature required touching exactly one new folder. Feature development velocity increased because teams stopped stepping on each other's changes — slice isolation meant that two developers could work on different features with zero merge conflicts in shared service layers. Code review complexity dropped as each PR contained one complete, self-contained behavior change.",
    "case_study_zh": "Jimmy Bogard工作的咨询公司Headspring（现已被埃森哲收购）在多个企业客户项目中应用了垂直切片架构。在一个拥有200多个功能的大型保险理赔处理系统中，他们用功能文件夹替换了传统的N层架构（控制器→服务→仓储），每个文件夹包含命令/查询、验证器、处理器和任何支持类型。结果是开发者可以通过阅读单个文件夹来理解完整功能，而不必在6个目录的6层代码中追踪。添加新功能只需要触及一个新文件夹。功能开发速度提升，因为团队不再相互踩踏彼此的变更——切片隔离意味着两个开发者可以在不同功能上工作而共享服务层零合并冲突。代码审查复杂性下降，因为每个PR都包含一个完整的、自包含的行为变更。",
    "case_study_challenge": "A large insurance claims processing system with over 200 features was built on traditional N-Tier architecture — Controllers, Services, Repositories spread across six directories. Understanding a single feature required tracing code through six layers. Two developers working on different features routinely caused merge conflicts in shared service classes.",
    "case_study_challenge_zh": "一个拥有200多个功能的大型保险理赔处理系统建立在传统N层架构上——控制器、服务、仓储分散在六个目录中。理解单个功能需要在六层代码中追踪。两个开发者在不同功能上工作时，经常在共享服务类中产生合并冲突。",
    "case_study_approach": "Headspring replaced the layered structure with feature folders — each containing the command or query, its validator, its handler, and any supporting types. Instead of organizing code by technical concern, they organized it by user-facing behavior. Each folder was a complete, self-contained vertical slice of the application.",
    "case_study_approach_zh": "Headspring用功能文件夹替换了分层结构——每个文件夹包含命令或查询、其验证器、处理器及任何支持类型。他们不再按技术关注点组织代码，而是按面向用户的行为组织。每个文件夹都是应用程序的一个完整、自包含的垂直切片。",
    "case_study_result": "A developer could understand a complete feature by reading a single folder. Adding a new feature meant creating exactly one new folder. Merge conflicts in shared layers dropped to zero. Code review complexity fell sharply — each pull request contained one complete, self-contained behavior change instead of scattered modifications across six directories.",
    "case_study_result_zh": "开发者只需阅读一个文件夹就能理解完整功能。新增功能意味着仅创建一个新文件夹。共享层的合并冲突降至零。代码审查复杂度大幅下降——每个PR包含一个完整的、自包含的行为变更，而非分散在六个目录中的零碎修改。",
    "case_study_quote": "The question stopped being 'which layer does this go in?' and became 'which feature does this belong to?' That one shift changed everything.",
    "case_study_quote_zh": "问题不再是「这应该放在哪一层？」而变成了「这属于哪个功能？」仅此一个转变就改变了一切。",
    "when_not_to_use": [
      "Highly domain-complex applications where DDD bounded contexts with rich aggregates, domain events, and deep invariants provide more value than the simplicity of flat feature folders",
      "Libraries and frameworks (not applications) where the organizational unit is the public API surface area, not user-facing features",
      "Small applications with 10-20 features where any organizational structure provides sufficient clarity and the overhead of setting up a mediator pattern exceeds the benefit"
    ],
    "when_not_to_use_zh": [
      "高度领域复杂的应用，DDD限界上下文（具有丰富聚合、领域事件和深度不变量）提供的价值超过扁平功能文件夹的简洁性",
      "库和框架（非应用），其中组织单元是公共API表面积，而非面向用户的功能",
      "拥有10-20个功能的小型应用，任何组织结构都提供足够的清晰度，设置中介器模式的开销超过收益"
    ],
    "adopters": [
      "Headspring (Accenture)",
      "JetBrains",
      "Microsoft (internal services)",
      "Stack Overflow",
      "Ardalis (Steve Smith)",
      "Contoso (reference implementations)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Bogard, J. (2018). \"Vertical Slice Architecture\". NDC Sydney. jimmybogard.com.",
    "secondary_sources": [
      "Bogard, J. (2013). \"MediatR: Simple mediator implementation in .NET\". github.com/jbogard/MediatR.",
      "Smith, S. (2020). \"Vertical Slices\". ardalis.com.",
      "Percival, H. & Gregory, B. (2020). \"Architecture Patterns with Python\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "domain-driven-design",
        "type": "related"
      }
    ]
  },
  {
    "id": 279,
    "name": "Specification Pattern",
    "name_zh": "规格模式",
    "slug": "specification-pattern",
    "category": "coding",
    "desc": "Encapsulating business rules as composable, reusable objects that can be combined with boolean logic to express complex domain predicates",
    "desc_zh": "将业务规则封装为可组合、可复用的对象，通过布尔逻辑组合来表达复杂的领域谓词",
    "steps": [
      "Identify business rules that are duplicated across the codebase or mixed into query logic and UI validation — these are the candidates for extraction into explicit Specification objects",
      "Define a Specification interface with a single IsSatisfiedBy(entity) method (or an expression property for database-pushable specs) that encapsulates one cohesive business rule",
      "Implement concrete specifications for each atomic rule: PremiumCustomerSpec, ActiveAccountSpec, EligibleForDiscountSpec — each class encodes one well-named business concept",
      "Implement composition operators on the Specification base class: And(), Or(), Not() — so complex rules can be expressed as new PremiumCustomerSpec().And(new ActiveAccountSpec()) rather than inline boolean logic",
      "Apply specifications consistently: use them in domain validation, in query filters (translating specs to database predicates via expression trees), and in UI to drive conditional visibility, ensuring the rule is defined once and used everywhere"
    ],
    "steps_zh": [
      "识别在代码库中重复或混入查询逻辑和UI验证的业务规则——这些是提取为显式Specification对象的候选",
      "定义带有单一IsSatisfiedBy(entity)方法（或用于数据库可推送规格的表达式属性）的Specification接口，封装一个内聚的业务规则",
      "为每个原子规则实现具体规格：PremiumCustomerSpec、ActiveAccountSpec、EligibleForDiscountSpec——每个类编码一个命名良好的业务概念",
      "在Specification基类上实现组合运算符：And()、Or()、Not()——使复杂规则可以表达为new PremiumCustomerSpec().And(new ActiveAccountSpec())，而非内联布尔逻辑",
      "一致地应用规格：在领域验证、查询过滤器（通过表达式树将规格转换为数据库谓词）和UI中使用它们来驱动条件可见性，确保规则定义一次随处使用"
    ],
    "ai_relevant": false,
    "viz_type": "venn",
    "viz_labels": [
      "Specification",
      "And Spec",
      "Or Spec"
    ],
    "viz_labels_zh": [
      "规格",
      "且规格",
      "或规格"
    ],
    "related": [
      "domain-driven-design",
      "solid-principles",
      "repository-pattern"
    ],
    "tags": [
      "specification-pattern",
      "ddd",
      "eric-evans",
      "business-rules",
      "domain-logic",
      "composable",
      "predicate"
    ],
    "origin_author": "Eric Evans",
    "origin_source": "Evans, E. & Fowler, M. (1997). \"Specifications\". martinfowler.com. Elaborated in Domain-Driven Design (Evans, 2003).",
    "origin_source_zh": "Evans, E. & Fowler, M.（1997）.「规格」. martinfowler.com. 在《领域驱动设计》（Evans，2003）中进一步阐述。",
    "complexity": "intermediate",
    "when_to_use": [
      "Business rules that are duplicated across multiple locations (domain validation, query filters, UI conditionals) and need a single canonical definition that can be reused without copy-paste",
      "Complex eligibility or filtering logic that is currently expressed as hard-to-read boolean chains in service methods or repository queries",
      "Domain-rich applications where business rules should be named, documented, and discoverable as first-class domain concepts rather than anonymous lambda expressions",
      "Systems where business rules are frequently added or changed and need to be testable in isolation without spinning up the full application or database"
    ],
    "when_to_use_zh": [
      "在多个位置重复出现（领域验证、查询过滤器、UI条件）并需要单一规范定义的业务规则，可以复用而无需复制粘贴",
      "当前在服务方法或仓储查询中表达为难以阅读的布尔链的复杂资格或过滤逻辑",
      "业务规则应被命名、记录并作为一等领域概念可发现的领域丰富应用，而非匿名lambda表达式",
      "业务规则频繁添加或更改，需要在不启动完整应用或数据库的情况下独立可测试的系统"
    ],
    "core_concepts": [
      "Specification as Domain Object: a specification is a first-class domain concept — a named business rule (EligibleForLoyaltyReward, PastDueInvoice) that is explicit, documented, and discoverable rather than hidden in anonymous predicates",
      "Composability: specifications support boolean composition (And, Or, Not) so complex rules are expressed by combining atomic specifications rather than writing nested boolean expressions, keeping each rule independently readable",
      "Dual-Use Specification: a well-implemented specification can be used both for in-memory validation (IsSatisfiedBy(entity)) and for database query generation (ToExpression() → IQueryable<T>.Where(spec)) using expression trees",
      "Rule Centralization: by encoding each business rule in exactly one specification class, changes to the rule (eligibility threshold changes, new exceptions) propagate automatically everywhere the specification is used",
      "Separation of What from How: specifications separate what the business rule is from how it is applied — the same EligibleForDiscountSpec can drive domain validation, database filtering, and UI rendering without any of these contexts knowing about each other"
    ],
    "core_concepts_zh": [
      "规格即领域对象：规格是一等领域概念——命名的业务规则（EligibleForLoyaltyReward、PastDueInvoice），明确、有文档且可发现，而非隐藏在匿名谓词中",
      "可组合性：规格支持布尔组合（And、Or、Not），复杂规则通过组合原子规格来表达，而非编写嵌套布尔表达式，保持每条规则独立可读",
      "双用规格：良好实现的规格既可用于内存中验证（IsSatisfiedBy(entity)），也可通过表达式树用于数据库查询生成（ToExpression() → IQueryable<T>.Where(spec)）",
      "规则集中化：通过在恰好一个规格类中编码每条业务规则，规则的变更（资格阈值变化、新例外）自动传播到使用该规格的所有地方",
      "将是什么与如何分离：规格将业务规则是什么与如何应用分离——相同的EligibleForDiscountSpec可以驱动领域验证、数据库过滤和UI渲染，而这些上下文之间互不了解"
    ],
    "timeline": [
      [
        "1997",
        "Eric Evans and Martin Fowler publish 'Specifications', a technical paper introducing the Specification pattern as a way to express business rules as objects"
      ],
      [
        "2003",
        "Evans' 'Domain-Driven Design' book includes Specification as a tactical DDD pattern, establishing it as a core building block for domain modeling"
      ],
      [
        "2010",
        "Expression tree-based specifications emerge in .NET (LINQ) and Java (JPA Criteria API) enabling specifications to translate into database queries, making the pattern practical for repository filtering"
      ],
      [
        "2018",
        "Ardalis (Steve Smith) open-sources the Specification library for .NET, providing a production-ready implementation that brings the pattern to mainstream .NET development"
      ]
    ],
    "timeline_zh": [
      [
        "1997",
        "Eric Evans和Martin Fowler发表「规格」技术论文，将规格模式作为将业务规则表达为对象的方式引入"
      ],
      [
        "2003",
        "Evans的《领域驱动设计》书籍将规格作为战术DDD模式纳入，确立其作为领域建模核心构建块的地位"
      ],
      [
        "2010",
        ".NET（LINQ）和Java（JPA Criteria API）中出现基于表达式树的规格，使规格可以转换为数据库查询，使该模式在仓储过滤中实际可用"
      ],
      [
        "2018",
        "Ardalis（Steve Smith）为.NET开源规格库，提供生产就绪的实现，将该模式带入主流.NET开发"
      ]
    ],
    "dos": [
      "Do give each specification a meaningful domain name that a business stakeholder would recognize — PremiumCustomerSpecification is better than CustomerTypeEqualsThreeSpecification",
      "Do implement specifications as expression trees (not just in-memory predicates) when they need to be applied as database query filters — in-memory-only specs that must load all records to filter are a performance anti-pattern",
      "Do compose specifications using And/Or/Not operators rather than creating new mega-specifications that duplicate the logic of existing specs — composition is the primary value of the pattern",
      "Do test specifications in isolation with unit tests covering both the positive case (entities that satisfy the spec) and negative cases (entities that don't), including edge cases at boundary conditions"
    ],
    "dos_zh": [
      "为每个规格取一个业务利益相关方能够识别的有意义的领域名称——PremiumCustomerSpecification比CustomerTypeEqualsThreeSpecification更好",
      "当规格需要作为数据库查询过滤器应用时，将其实现为表达式树（不仅仅是内存中谓词）——必须加载所有记录才能过滤的纯内存规格是性能反模式",
      "使用And/Or/Not运算符组合规格，而非创建重复现有规格逻辑的大型规格——组合是该模式的主要价值",
      "通过单元测试独立测试规格，覆盖正向情况（满足规格的实体）和负向情况（不满足的实体），包括边界条件的边缘情况"
    ],
    "donts": [
      "Don't put business logic inside specifications beyond the single rule they represent — a specification that checks 5 unrelated conditions is a rule engine, not a specification, and should be decomposed",
      "Don't use specifications for infrastructure concerns (is the database available? is the cache warm?) — specifications are for domain business rules about domain entities, not for infrastructure state checks",
      "Don't make specifications depend on external services, repositories, or I/O — a specification that calls a database or API to evaluate its predicate cannot be composed efficiently and makes testing require mocking infrastructure",
      "Don't over-apply the pattern to every boolean check in the codebase — simple one-off predicates that are used in exactly one place don't benefit from being extracted into a named specification class"
    ],
    "donts_zh": [
      "不要在规格中放入超出其所代表单一规则的业务逻辑——检查5个不相关条件的规格是规则引擎，而非规格，应该被分解",
      "不要将规格用于基础设施关注点（数据库是否可用？缓存是否预热？）——规格用于领域实体的领域业务规则，而非基础设施状态检查",
      "不要让规格依赖外部服务、仓储或I/O——调用数据库或API来评估其谓词的规格无法高效组合，使测试需要模拟基础设施",
      "不要对代码库中的每个布尔检查过度应用该模式——在恰好一个地方使用的简单一次性谓词不会从被提取到命名规格类中受益"
    ],
    "case_study_company": "Ardalis / Microsoft",
    "case_study": "Steve Smith (Ardalis), a Microsoft MVP, developed and open-sourced the Ardalis.Specification library for .NET after working on large enterprise applications where business eligibility rules were scattered as anonymous LINQ predicates across hundreds of repository methods. In one insurance application case study, the rule 'a policy is eligible for renewal' was duplicated in 14 places across the codebase — repository queries, domain services, API controllers, and Blazor component visibility conditions — with subtle variations in 3 of those locations. After extracting it into a single PolicyEligibleForRenewalSpec with expression tree support, the team eliminated all duplicates. When the business changed the eligibility criteria (adding a payment status check), the change was made in one file and propagated automatically to all 14 usage sites with zero regressions.",
    "case_study_zh": "微软MVP Steve Smith（Ardalis）在处理大型企业应用后，开发并开源了.NET的Ardalis.Specification库。在这些应用中，业务资格规则作为匿名LINQ谓词散布在数百个仓储方法中。在一个保险应用案例研究中，「保单是否符合续保条件」规则在代码库的14个地方重复——仓储查询、领域服务、API控制器和Blazor组件可见性条件——其中3个地方存在细微差异。将其提取为带表达式树支持的单一PolicyEligibleForRenewalSpec后，团队消除了所有重复。当业务更改资格标准（添加付款状态检查）时，变更在一个文件中进行，自动传播到所有14个使用点，零回归。",
    "when_not_to_use": [
      "Simple CRUD applications where business rules are few, stable, and not duplicated — the overhead of the Specification pattern exceeds its organizational benefit for trivial data validation",
      "Scripts and one-off data processing jobs where no code reuse across contexts is required and named domain concepts add no value",
      "Performance-critical hot paths where the expression tree compilation overhead of database-translatable specifications introduces measurable latency on every request"
    ],
    "when_not_to_use_zh": [
      "业务规则少、稳定且不重复的简单CRUD应用——规格模式的开销超过其对简单数据验证的组织收益",
      "脚本和一次性数据处理作业，不需要跨上下文的代码复用，命名领域概念不增加价值",
      "性能关键热路径，数据库可转换规格的表达式树编译开销在每次请求上引入可测量的延迟"
    ],
    "adopters": [
      "Ardalis (Steve Smith)",
      "Microsoft (internal DDD projects)",
      "NopCommerce",
      "eShopOnWeb (Microsoft reference)",
      "JetBrains Rider (domain logic)",
      "Umbraco CMS"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "Evans, E. & Fowler, M. (1997). \"Specifications\". martinfowler.com.",
    "secondary_sources": [
      "Evans, E. (2003). \"Domain-Driven Design: Tackling Complexity in the Heart of Software\". Addison-Wesley.",
      "Smith, S. (2018). \"Ardalis.Specification: Specification pattern implementation for .NET\". github.com/ardalis/Specification.",
      "Vernon, V. (2013). \"Implementing Domain-Driven Design\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "domain-driven-design",
        "type": "complement"
      },
      {
        "slug": "repository-pattern",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "related"
      }
    ]
  },
  {
    "id": 299,
    "name": "Flyweight Pattern",
    "name_zh": "享元模式",
    "slug": "flyweight-pattern",
    "category": "coding",
    "desc": "GoF structural pattern that minimises memory usage by sharing fine-grained objects whose state can be externalised, enabling large numbers of similar objects to be represented efficiently.",
    "desc_zh": "GoF 结构型模式，通过共享可外化状态的细粒度对象来最小化内存使用，使大量相似对象得以高效表示。",
    "steps": [
      "Analyse the large collection of objects to separate intrinsic state (immutable, shareable, context-independent — e.g., the glyph shape of a character) from extrinsic state (context-dependent — e.g., the position and colour of each character on screen)",
      "Create a Flyweight interface that defines the operation method accepting extrinsic state as a parameter, so shared flyweight objects do not store it internally",
      "Implement Concrete Flyweight classes that store only intrinsic state; multiple client contexts can share the same Concrete Flyweight instance without conflict",
      "Build a Flyweight Factory that maintains a pool of existing flyweight instances keyed by their intrinsic state — the factory returns an existing instance when the key matches, or creates and caches a new one",
      "Replace direct object instantiation in client code with factory calls; pass extrinsic state as arguments to flyweight operations at call time rather than storing it in each flyweight instance"
    ],
    "steps_zh": [
      "分析大量对象集合，将内部状态（不可变、可共享、与上下文无关——如字符的字形）与外部状态（依赖上下文——如每个字符在屏幕上的位置和颜色）分离",
      "创建 Flyweight 接口，定义接受外部状态作为参数的操作方法，使共享的享元对象不在内部存储外部状态",
      "实现只存储内部状态的具体 Flyweight 类；多个客户端上下文可以共享同一具体享元实例而不产生冲突",
      "构建 Flyweight 工厂，维护以内部状态为键的现有享元实例池——当键匹配时工厂返回现有实例，否则创建并缓存新实例",
      "在客户端代码中用工厂调用替换直接对象实例化；在调用时将外部状态作为参数传递给享元操作，而不是将其存储在每个享元实例中"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Flyweight",
      "Intrinsic State",
      "Extrinsic State",
      "Factory"
    ],
    "viz_labels_zh": [
      "享元",
      "内部状态",
      "外部状态",
      "享元工厂"
    ],
    "related": [
      "singleton-pattern"
    ],
    "tags": [
      "flyweight",
      "gof",
      "structural-pattern",
      "memory-optimization",
      "object-sharing",
      "design-patterns"
    ],
    "origin_author": "Gang of Four",
    "origin_source": "Gamma, E., Helm, R., Johnson, R., & Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "origin_source_zh": "Gamma, E., Helm, R., Johnson, R., & Vlissides, J.（1994）.《设计模式：可复用面向对象软件的基础》. Addison-Wesley.",
    "complexity": "intermediate",
    "when_to_use": [
      "Applications that create large numbers of fine-grained objects — thousands or millions — where memory footprint is a critical constraint and many objects share common state",
      "Rendering engines, text editors, game engines, and CAD applications where visual elements (glyphs, sprites, tiles) are repeated many times with only position and colour varying",
      "Systems where object creation cost (memory allocation, GC pressure) is measurably degrading performance and profiling confirms that object count is the root cause",
      "Domain models with many instances of value-like objects (e.g., product SKUs, currency instances, tax codes) where identity equality is not required and shared instances are semantically correct"
    ],
    "when_to_use_zh": [
      "创建大量细粒度对象（成千上万甚至数百万）的应用，内存占用是关键约束，且许多对象共享公共状态",
      "渲染引擎、文本编辑器、游戏引擎和 CAD 应用，其中视觉元素（字形、精灵、瓦片）被重复多次，仅位置和颜色不同",
      "对象创建成本（内存分配、GC 压力）可测量地降低了性能，且性能分析确认对象数量是根本原因的系统",
      "拥有许多值类型对象实例（如产品 SKU、货币实例、税码）的领域模型，不需要标识相等性且共享实例在语义上正确"
    ],
    "core_concepts": [
      "Intrinsic vs Extrinsic State: intrinsic state is stored inside the flyweight and is immutable and shared; extrinsic state is context-dependent and must be passed to the flyweight operations by the caller — this separation is the structural foundation of the pattern",
      "Flyweight Factory: a factory or registry that ensures only one flyweight instance exists per unique intrinsic state; clients always go through the factory rather than directly instantiating flyweights to guarantee sharing",
      "Shared Identity: because flyweights are shared, they must not be treated as having object identity — two references to the same flyweight represent different logical objects that happen to share implementation; equality checks on flyweight identity are a common bug",
      "Memory-Time Trade-off: the pattern trades CPU time (extrinsic state must be computed or passed at every operation call) for memory (fewer objects on the heap); this trade-off is only worthwhile when object count is the dominant memory consumer"
    ],
    "core_concepts_zh": [
      "内部状态与外部状态：内部状态存储在享元内部，是不可变且共享的；外部状态依赖上下文，必须由调用方传递给享元操作——这种分离是该模式的结构基础",
      "享元工厂：确保每个唯一内部状态只存在一个享元实例的工厂或注册表；客户端始终通过工厂而非直接实例化享元来保证共享",
      "共享标识：由于享元是共享的，不能将其视为具有对象标识——对同一享元的两个引用代表碰巧共享实现的不同逻辑对象；对享元标识的相等性检查是常见的 bug",
      "内存-时间权衡：该模式以 CPU 时间（每次操作调用时必须计算或传递外部状态）换取内存（堆上更少的对象）；仅当对象数量是主要内存消耗者时这种权衡才值得"
    ],
    "timeline": [
      [
        "1990",
        "Paul Calder and Mark Linton introduce the Flyweight concept in the Unidraw graphical editing framework for X Windows, where thousands of identical glyph objects motivated sharing"
      ],
      [
        "1994",
        "Gang of Four canonise Flyweight as one of the 23 design patterns in Design Patterns: Elements of Reusable Object-Oriented Software, popularising it across the industry"
      ],
      [
        "2002",
        "Java String interning and Integer cache implement implicit flyweight sharing in the JVM standard library, embedding the pattern in the language runtime"
      ],
      [
        "2015",
        "JavaScript engines implement hidden class sharing and string interning as implicit flyweights; React virtual DOM key reconciliation uses flyweight-like object reuse to minimise allocations"
      ]
    ],
    "timeline_zh": [
      [
        "1990",
        "Paul Calder 和 Mark Linton 在 X Windows 的 Unidraw 图形编辑框架中引入享元概念，其中成千上万个相同字形对象促使了共享机制的诞生"
      ],
      [
        "1994",
        "Gang of Four 在《设计模式：可复用面向对象软件的基础》中将享元模式规范化为 23 种设计模式之一，在业界广泛推广"
      ],
      [
        "2002",
        "Java 字符串内部化和 Integer 缓存在 JVM 标准库中实现了隐式享元共享，将该模式嵌入语言运行时"
      ],
      [
        "2015",
        "JavaScript 引擎实现隐藏类共享和字符串内部化作为隐式享元；React 虚拟 DOM 键调和使用类享元的对象复用来最小化内存分配"
      ]
    ],
    "dos": [
      "Do profile memory usage before applying Flyweight — the pattern adds significant architectural complexity and is only justified when object count is measurably consuming too much heap memory",
      "Do make flyweight intrinsic state truly immutable — any mutation of shared state causes subtle, hard-to-debug corruption that affects all clients sharing the same flyweight instance",
      "Do design the Flyweight Factory as a thread-safe singleton when flyweights are shared across concurrent threads — double-checked locking or ConcurrentHashMap-based caching prevents race conditions on the factory",
      "Do document which fields are intrinsic and which are extrinsic clearly in code and tests — confusion about the intrinsic/extrinsic boundary is the most common source of flyweight bugs"
    ],
    "dos_zh": [
      "在应用享元模式之前先分析内存使用情况——该模式会增加显著的架构复杂性，仅在对象数量可测量地消耗过多堆内存时才合理",
      "确保享元的内部状态真正不可变——对共享状态的任何修改都会导致微妙且难以调试的损坏，影响共享同一享元实例的所有客户端",
      "当享元在并发线程间共享时，将享元工厂设计为线程安全的单例——双重检查锁定或基于 ConcurrentHashMap 的缓存可防止工厂中的竞争条件",
      "在代码和测试中清楚地记录哪些字段是内部状态、哪些是外部状态——对内部/外部边界的混淆是享元 bug 最常见的来源"
    ],
    "donts": [
      "Do not apply Flyweight prematurely as an optimisation strategy — it dramatically increases code complexity through the intrinsic/extrinsic split and factory indirection; use it only after profiling confirms memory pressure from excessive object count",
      "Do not share flyweights across isolation boundaries in multi-tenant systems without careful access control — shared flyweight instances can become a cross-tenant data leakage vector if extrinsic state is accidentally stored inside them",
      "Do not confuse Flyweight with Singleton — Singleton allows exactly one instance of an object regardless of state; Flyweight allows one instance per unique intrinsic state value, so multiple flyweight instances can exist",
      "Do not store extrinsic state in flyweight fields as an optimisation shortcut — this defeats the sharing guarantee and gradually converts flyweights into regular heavyweight objects as the codebase evolves"
    ],
    "donts_zh": [
      "不要将享元模式作为优化策略过早应用——内部/外部分离和工厂间接层会大幅增加代码复杂性；仅在性能分析确认过多对象数量导致内存压力后才使用",
      "在多租户系统中不要在没有仔细访问控制的情况下跨隔离边界共享享元——如果外部状态被意外存储在享元内部，共享的享元实例可能成为跨租户数据泄漏的向量",
      "不要将享元与单例混淆——单例允许恰好一个对象实例而不考虑状态；享元允许每个唯一内部状态值一个实例，因此可以存在多个享元实例",
      "不要将外部状态作为优化捷径存储在享元字段中——这会破坏共享保证，并随着代码库的演进逐渐将享元转变为普通的重量级对象"
    ],
    "case_study_company": "Java JDK / OpenJDK",
    "case_study": "The Java Development Kit implements the Flyweight pattern implicitly in several core library features. Integer.valueOf(int) in Java 5+ caches Integer instances for values between -128 and 127, the range most frequently used in loops and indexing — rather than creating a new Integer object for every boxing operation, the JVM returns a shared flyweight instance from a pre-populated cache array. For String, Java maintains a string intern pool: String.intern() returns the canonical flyweight instance for any given string value. These JVM-level flyweights have a measurable impact at scale: in high-throughput services processing millions of short integer counters or repeated string tokens, the Integer cache alone can reduce GC pressure by eliminating tens of millions of short-lived allocations per second, meaningfully reducing pause times in applications running on JVMs with concurrent garbage collectors.",
    "case_study_zh": "Java 开发工具包在多个核心库功能中隐式实现了享元模式。Java 5+ 中的 Integer.valueOf(int) 为 -128 到 127 之间的值（最常用于循环和索引的范围）缓存 Integer 实例——JVM 从预填充的缓存数组返回共享享元实例，而非为每次装箱操作创建新的 Integer 对象。对于 String，Java 维护一个字符串内部化池：String.intern() 返回任意字符串值的规范享元实例。这些 JVM 级享元在大规模场景下具有可测量的影响：在处理数百万个短整数计数器或重复字符串标记的高吞吐量服务中，Integer 缓存本身可以通过每秒消除数千万个短暂分配来减轻 GC 压力，有效降低使用并发垃圾收集器的 JVM 上的应用停顿时间。",
    "when_not_to_use": [
      "Applications with relatively few objects where the memory savings do not justify the architectural overhead of separating intrinsic from extrinsic state and routing all creation through a factory",
      "Objects with mostly unique state where the intrinsic state is too fine-grained to enable sharing — if every instance has different intrinsic state, the Flyweight Factory becomes a plain object cache with no sharing benefit",
      "Performance-critical paths where passing extrinsic state as a parameter on every operation call introduces function call overhead that outweighs the memory savings",
      "Mutable objects where the shared state must be modified independently per client — mutability and sharing are fundamentally incompatible in Flyweight"
    ],
    "when_not_to_use_zh": [
      "对象数量相对较少的应用，内存节省不足以证明将内部状态与外部状态分离以及通过工厂路由所有创建操作的架构开销是合理的",
      "大多数状态唯一的对象，内部状态粒度太细无法实现共享——如果每个实例都有不同的内部状态，享元工厂就变成了没有共享优势的普通对象缓存",
      "每次操作调用时将外部状态作为参数传递引入的函数调用开销超过内存节省的性能关键路径",
      "必须按客户端独立修改共享状态的可变对象——可变性和共享在享元中是根本不相容的"
    ],
    "adopters": [
      "Java JDK (Integer cache, String intern)",
      "JavaScript V8 engine (hidden classes)",
      "Microsoft Word (glyph rendering)",
      "Unity (sprite atlas sharing)",
      "Unreal Engine (static mesh instances)",
      "Apache POI (cell style sharing)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "performance",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., & Vlissides, J. (1994). \"Design Patterns: Elements of Reusable Object-Oriented Software\". Addison-Wesley.",
    "secondary_sources": [
      "Freeman, E. & Freeman, E. (2004). \"Head First Design Patterns\". O'Reilly Media.",
      "Bloch, J. (2018). \"Effective Java\", 3rd ed., Item 6: Avoid creating unnecessary objects. Addison-Wesley.",
      "Calder, P. & Linton, M. (1990). \"Glyphs: Flyweight Objects for User Interfaces\". UIST 1990 Proceedings."
    ],
    "typed_relations": [
      {
        "slug": "singleton-pattern",
        "type": "related"
      },
      {
        "slug": "factory-method-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 330,
    "name": "Data Transfer Object (DTO)",
    "name_zh": "数据传输对象（DTO）",
    "slug": "data-transfer-object",
    "category": "coding",
    "desc": "A simple object that carries data between processes or layers, containing no business logic — its sole purpose is to reduce the number of method calls by bundling data into a single transfer unit",
    "desc_zh": "在进程或层之间传递数据的简单对象，不包含任何业务逻辑——其唯一目的是通过将数据打包成单一传输单元来减少方法调用次数",
    "steps": [
      "Identify the data boundary: locate the interface between two layers or processes (controller ↔ service, service ↔ database, microservice ↔ microservice) where multiple data fields are exchanged repeatedly in a chatty interaction",
      "Define the DTO class: create a plain object (no business logic, no database references) containing only the fields needed by the consumer of that interface; give it a name that reflects the operation context (CreateOrderRequest, ProductSummaryResponse)",
      "Map between domain model and DTO: implement explicit mapping code (or use a mapping library such as MapStruct, AutoMapper, or class-transformer) to convert between rich domain objects and thin DTOs at the boundary — never expose domain internals directly",
      "Validate at ingress: apply input validation to DTOs arriving from external callers (HTTP request bodies, gRPC messages) before mapping them to domain objects; use validation annotations or schema validators to enforce constraints early",
      "Version and evolve DTOs independently: treat DTOs as a public API contract; add fields with defaults for backward compatibility, deprecate fields explicitly, and use versioned DTO classes or API versioning to prevent breaking existing consumers during evolution"
    ],
    "steps_zh": [
      "识别数据边界：定位两层或两个进程之间的接口（控制器 ↔ 服务、服务 ↔ 数据库、微服务 ↔ 微服务），在那里多个数据字段在频繁的交互中被反复交换",
      "定义 DTO 类：创建一个普通对象（无业务逻辑、无数据库引用），仅包含该接口消费者所需的字段；给它一个反映操作上下文的名称（CreateOrderRequest、ProductSummaryResponse）",
      "在领域模型和 DTO 之间映射：实现显式映射代码（或使用 MapStruct、AutoMapper、class-transformer 等映射库）在边界处在富领域对象和精简 DTO 之间转换——永远不要直接暴露领域内部",
      "在入口处验证：在将来自外部调用者的 DTO（HTTP 请求体、gRPC 消息）映射到领域对象之前，对其应用输入验证；使用验证注解或模式验证器尽早强制约束",
      "独立地对 DTO 进行版本控制和演进：将 DTO 视为公共 API 契约；为向后兼容添加带默认值的字段，明确弃用字段，并使用版本化 DTO 类或 API 版本控制，以防止在演进过程中破坏现有消费者"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Source Layer",
      "DTO",
      "Target Layer",
      "Serialization"
    ],
    "viz_labels_zh": [
      "源层",
      "数据传输对象",
      "目标层",
      "序列化"
    ],
    "related": [
      "repository-pattern",
      "data-mapper-pattern",
      "solid-principles",
      "clean-code-principles"
    ],
    "tags": [
      "data-transfer",
      "layered-architecture",
      "api-design",
      "decoupling",
      "serialization"
    ],
    "origin_author": "Martin Fowler, \"Patterns of Enterprise Application Architecture\", 2002",
    "origin_source": "Fowler, M. (2002). \"Patterns of Enterprise Application Architecture\". Addison-Wesley. Pattern: Data Transfer Object, pp. 401–415.",
    "origin_source_zh": "Fowler（2002）《企业应用架构模式》，Addison-Wesley，模式：数据传输对象，第 401–415 页",
    "complexity": "beginner",
    "abstraction_level": "code",
    "quality_concerns": [
      "performance",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "when_to_use": [
      "When calling remote services or crossing process boundaries where each method call is expensive and batching multiple fields into a single round trip reduces network overhead",
      "When you need to expose only a subset of a domain model to an API consumer — the DTO acts as a projection that hides sensitive fields, computed properties, and ORM-managed relationships",
      "When different consumers (mobile app, admin dashboard, public API) need different shapes of the same underlying data, requiring separate response DTOs rather than a single bloated domain object",
      "When decoupling the public API contract from internal domain model refactoring is important — changing a domain entity should not force a breaking API change for consumers",
      "When validating and deserializing structured input from HTTP bodies, gRPC messages, or message queue payloads before passing data into domain logic"
    ],
    "when_to_use_zh": [
      "当调用远程服务或跨越进程边界时，每次方法调用都很昂贵，将多个字段打包成单一往返可减少网络开销",
      "当你需要仅向 API 消费者暴露领域模型的子集时——DTO 充当隐藏敏感字段、计算属性和 ORM 管理关系的投影",
      "当不同消费者（移动应用、管理仪表板、公共 API）需要相同底层数据的不同形状，需要单独的响应 DTO 而非单一膨胀的领域对象时",
      "当将公共 API 契约与内部领域模型重构解耦很重要时——更改领域实体不应强制消费者进行破坏性 API 更改",
      "当在将数据传入领域逻辑之前，验证和反序列化来自 HTTP 请求体、gRPC 消息或消息队列负载的结构化输入时"
    ],
    "core_concepts": [
      "Anemic object: a DTO is intentionally anemic — it holds data fields and possibly simple getters/setters but contains zero business logic; this is a feature, not a violation of OOP principles, because its purpose is transport, not behaviour",
      "Boundary mapping: explicit code that converts a rich domain object (with business rules, invariants, and ORM associations) to a flat DTO and back; the mapping layer is where field selection, renaming, and format conversion happen",
      "Serialization contract: DTOs are the objects that get serialized to JSON, XML, Protobuf, or Avro for transmission; their structure defines the wire format and must be evolved carefully to maintain backward compatibility",
      "Request vs. response DTO: inbound DTOs (requests/commands) carry client-supplied data into the application and are subject to validation; outbound DTOs (responses/views) carry application-computed data to clients and are shaped for consumer convenience",
      "Projection: a DTO that represents a subset or reshaping of the domain model for a specific use case — similar to a database view but at the application layer"
    ],
    "core_concepts_zh": [
      "贫血对象：DTO 是有意贫血的——它持有数据字段和可能的简单 getter/setter，但不包含任何业务逻辑；这是一个特性，而非 OOP 原则的违反，因为它的目的是传输而非行为",
      "边界映射：将富领域对象（具有业务规则、不变量和 ORM 关联）转换为扁平 DTO 并反向转换的显式代码；映射层是字段选择、重命名和格式转换发生的地方",
      "序列化契约：DTO 是被序列化为 JSON、XML、Protobuf 或 Avro 进行传输的对象；它们的结构定义了线上传输格式（wire format），必须谨慎演进以保持向后兼容性",
      "请求与响应 DTO：入站 DTO（请求/命令）将客户端提供的数据带入应用程序并需经过验证；出站 DTO（响应/视图）将应用程序计算的数据传递给客户端，并为消费者方便而塑形",
      "投影：表示特定用例的领域模型子集或重塑的 DTO——类似于数据库视图，但在应用层"
    ],
    "timeline": [
      [
        "1998",
        "Java EE (J2EE) 1.2 introduces Remote Method Invocation and Enterprise JavaBeans; developers discover that fine-grained RMI calls are prohibitively slow, motivating the Transfer Object (then called Value Object) pattern to batch data"
      ],
      [
        "2002",
        "Martin Fowler formalizes the Data Transfer Object pattern in \"Patterns of Enterprise Application Architecture\", distinguishing it from Value Object and documenting the mapping layer as a first-class concern"
      ],
      [
        "2006",
        "Sun Microsystems renames the J2EE pattern from \"Transfer Object\" to \"Data Transfer Object\" in the updated Java EE patterns catalog to resolve naming conflicts with Value Object"
      ],
      [
        "2011",
        "The rise of RESTful APIs and later gRPC/Protobuf cements DTOs as the universal mechanism for defining API request and response shapes across languages and frameworks"
      ]
    ],
    "timeline_zh": [
      [
        "1998",
        "Java EE（J2EE）1.2 引入远程方法调用和企业级 JavaBeans；开发者发现细粒度 RMI 调用速度极慢，促使开发传输对象（当时称为值对象）模式来批量处理数据"
      ],
      [
        "2002",
        "Martin Fowler 在《企业应用架构模式》中正式化了数据传输对象模式，将其与值对象区分开来，并将映射层记录为一等关注点"
      ],
      [
        "2006",
        "Sun Microsystems 在更新的 Java EE 模式目录中将 J2EE 模式从「Transfer Object」重命名为「Data Transfer Object」，以解决与值对象的命名冲突"
      ],
      [
        "2011",
        "RESTful API 的兴起以及后来的 gRPC/Protobuf 使 DTO 成为跨语言和框架定义 API 请求和响应形状的通用机制"
      ]
    ],
    "dos": [
      "Do name DTOs after the operation context rather than the domain entity: use OrderCreationRequest and OrderSummaryResponse instead of OrderDTO — the operation name communicates intent and makes it obvious which direction the data flows",
      "Do keep DTOs flat and serialization-friendly: avoid nested domain objects or lazy-loaded ORM collections; map everything to primitives, value types, or explicit nested DTO classes before the boundary",
      "Do validate DTO input at the entry point using declarative validators (Bean Validation annotations in Java, class-validator in TypeScript, Pydantic in Python) before mapping to domain objects — fail fast with descriptive error messages",
      "Do use a dedicated mapping layer or library: centralize all DTO ↔ domain mappings in assembler classes, mapper functions, or a mapping library rather than embedding mapping logic in controllers or service methods"
    ],
    "dos_zh": [
      "以操作上下文而非领域实体命名 DTO：使用 OrderCreationRequest 和 OrderSummaryResponse 而非 OrderDTO——操作名称传达意图并明确显示数据流向",
      "保持 DTO 扁平化且序列化友好：避免嵌套领域对象或惰性加载的 ORM 集合；在边界之前将所有内容映射为基本类型、值类型或显式嵌套 DTO 类",
      "在入口点使用声明式验证器验证 DTO 输入（Java 中的 Bean Validation 注解、TypeScript 中的 class-validator、Python 中的 Pydantic），然后再映射到领域对象——以描述性错误消息快速失败",
      "使用专用映射层或库：将所有 DTO ↔ 领域映射集中在组装器类、映射函数或映射库中，而不是将映射逻辑嵌入控制器或服务方法中"
    ],
    "donts": [
      "Don't add business logic or validation rules to DTOs: a DTO that enforces invariants, calculates derived fields, or calls services has become a domain object in disguise — move all logic to domain entities or application services",
      "Don't reuse the same DTO for multiple unrelated operations: a single UserDTO shared between registration, profile update, and admin audit endpoints accumulates fields from all contexts, resulting in a nullable field soup that confuses every consumer",
      "Don't skip the mapping layer by returning domain entities or ORM models directly from APIs: this exposes internal structure, leaks ORM-managed fields (like database IDs, version columns, audit timestamps), and couples the API contract to database schema changes",
      "Don't create DTOs for purely in-process method calls within the same layer: DTOs solve the remote call cost problem; adding them to local service-to-service calls in the same process adds boilerplate with no performance or decoupling benefit"
    ],
    "donts_zh": [
      "不要向 DTO 添加业务逻辑或验证规则：强制执行不变量、计算派生字段或调用服务的 DTO 已经变成了伪装的领域对象——将所有逻辑移动到领域实体或应用服务",
      "不要为多个不相关的操作重用同一个 DTO：在注册、个人资料更新和管理员审计端点之间共享的单个 UserDTO 会积累所有上下文的字段，导致混淆每个消费者的可空字段汤",
      "不要通过直接从 API 返回领域实体或 ORM 模型来跳过映射层：这会暴露内部结构，泄漏 ORM 管理的字段（如数据库 ID、版本列、审计时间戳），并将 API 契约与数据库模式变更耦合",
      "不要为同一层内的纯进程内方法调用创建 DTO：DTO 解决的是远程调用成本问题；将它们添加到同一进程内的本地服务间调用会增加样板代码，而不带来任何性能或解耦优势"
    ],
    "case_study_company": "Spring Framework / Java EE ecosystem",
    "case_study": "The J2EE (Java EE) pattern catalog documented the Transfer Object (later renamed Data Transfer Object) in response to a pervasive performance problem discovered by enterprise Java developers in the late 1990s: Entity EJBs exposed fine-grained remote interfaces where fetching an Order with 10 fields required 10 separate RMI calls, each incurring network round-trip overhead. Sun Microsystems and ThoughtWorks consultants, observing this anti-pattern across dozens of projects, codified the solution — bundle all required fields into a single serializable Transfer Object returned by a coarse-grained Session Facade — and published it in the J2EE Patterns catalog (2001) and Fowler's PoEAA (2002). The pattern subsequently propagated through the Spring Framework's layered architecture recommendations, becoming the standard way to structure controller request/response objects in Spring MVC applications, NestJS decorators, and gRPC Protobuf message definitions — all of which are DTOs under different names.",
    "case_study_zh": "J2EE（Java EE）模式目录记录了传输对象（后来重命名为数据传输对象），以回应 1990 年代末企业级 Java 开发者发现的普遍性能问题：实体 EJB 暴露了细粒度远程接口，获取一个具有 10 个字段的 Order 需要 10 次单独的 RMI 调用，每次都会产生网络往返开销。Sun Microsystems 和 ThoughtWorks 顾问在数十个项目中观察到这种反模式后，将解决方案编成规范——将所有必需字段打包到由粗粒度 Session Facade 返回的单个可序列化传输对象中——并在 J2EE 模式目录（2001）和 Fowler 的 PoEAA（2002）中发布。该模式随后通过 Spring Framework 的分层架构建议传播开来，成为在 Spring MVC 应用程序、NestJS 装饰器和 gRPC Protobuf 消息定义中构建控制器请求/响应对象的标准方式——所有这些都是不同名称下的 DTO。",
    "when_not_to_use": [
      "Simple CRUD endpoints in small applications where domain model and API shape are identical and are unlikely to diverge — the mapping boilerplate adds no value when the domain entity and API contract are the same object",
      "In-process function calls within a single bounded context where both caller and callee are in the same deployment unit — DTOs solve remote call cost; using them locally adds classes without any performance or isolation benefit",
      "Rapid prototyping and early-stage development where the domain model is changing rapidly — introducing DTO mapping before the model stabilizes creates a maintenance burden that slows down iteration without improving quality",
      "GraphQL APIs where field selection is handled by the query itself — the GraphQL resolver can project fields directly from domain objects, making a separate DTO layer redundant in most cases"
    ],
    "when_not_to_use_zh": [
      "小型应用程序中的简单 CRUD 端点，其中领域模型和 API 形状相同且不太可能分歧——当领域实体和 API 契约是同一对象时，映射样板不会增加任何价值",
      "单个有界上下文内的进程内函数调用，调用者和被调用者都在同一部署单元中——DTO 解决远程调用成本；在本地使用它们只会增加类而不带来任何性能或隔离优势",
      "领域模型快速变化的快速原型和早期开发阶段——在模型稳定之前引入 DTO 映射会产生维护负担，在不提高质量的情况下减缓迭代速度",
      "GraphQL API，其中字段选择由查询本身处理——GraphQL 解析器可以直接从领域对象投影字段，在大多数情况下使单独的 DTO 层变得多余"
    ],
    "adopters": [
      "Java EE / Spring Framework",
      ".NET / ASP.NET Core (with AutoMapper)",
      "NestJS (TypeScript class-validator DTOs)",
      "gRPC (Protobuf message definitions)",
      "Django REST Framework (Serializers)"
    ],
    "primary_source": "Fowler, M. (2002). \"Patterns of Enterprise Application Architecture\". Addison-Wesley. Chapter: Distribution Patterns — Data Transfer Object.",
    "secondary_sources": [
      "Alur, D., Crupi, J. & Malks, D. (2001). \"Core J2EE Patterns: Best Practices and Design Strategies\". Sun Microsystems Press. Transfer Object pattern.",
      "Vernon, V. (2013). \"Implementing Domain-Driven Design\". Addison-Wesley. Chapter 4: Architecture.",
      "Nygard, M. (2007). \"Release It!\". Pragmatic Programmers. Chapter on stability patterns at service boundaries."
    ],
    "typed_relations": [
      {
        "slug": "repository-pattern",
        "type": "complement"
      },
      {
        "slug": "data-mapper-pattern",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "related"
      }
    ]
  },
  {
    "id": 324,
    "name": "DRY (Don't Repeat Yourself)",
    "name_zh": "DRY 原则（不要重复自己）",
    "slug": "dry-principle",
    "category": "coding",
    "desc": "Every piece of knowledge must have a single, unambiguous, authoritative representation within a system. When you find yourself writing the same code in two places, extract it into one canonical source.",
    "desc_zh": "系统中的每一条知识都必须有唯一、明确、权威性的表述。当你在两个地方写相同的代码时，就应该将其提取为唯一的权威来源。",
    "steps": [
      "Identify duplicated logic, data, or knowledge across the codebase",
      "Find the canonical location or create a single source of truth",
      "Extract the duplication into a shared abstraction (function, class, constant, template)",
      "Update all call sites to reference the single source",
      "Verify tests still pass; ensure the abstraction is not over-generalized"
    ],
    "steps_zh": [
      "识别代码库中重复的逻辑、数据或知识",
      "找到权威位置，或创建唯一信源",
      "将重复部分提取为共享抽象（函数、类、常量、模板）",
      "更新所有调用处引用唯一来源",
      "验证测试仍通过；确保抽象没有过度泛化"
    ],
    "viz_type": "pyramid",
    "viz_labels": [
      "Duplication",
      "Abstraction",
      "Single Source",
      "Reuse"
    ],
    "viz_labels_zh": [
      "代码重复",
      "抽象提取",
      "单一来源",
      "复用"
    ],
    "complexity": "beginner",
    "origin_author": "Andy Hunt & Dave Thomas, 1999",
    "adopters": [
      "Ruby on Rails (core design philosophy)",
      "Django",
      "Spring Boot",
      "Terraform",
      "GraphQL"
    ],
    "related": [
      "solid-principles",
      "clean-code-principles",
      "functional-core-imperative-shell"
    ],
    "tags": [
      "dry",
      "code-quality",
      "maintainability",
      "abstraction",
      "refactoring"
    ],
    "ai_relevant": false,
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "when_to_use": "When you notice the same logic appears in two or more places, or when a constant, formula, or business rule is hard-coded in multiple files. Apply DRY during refactoring sprints or when onboarding reveals confusion about which copy is canonical.",
    "when_to_use_zh": "当相同逻辑出现在两个或更多地方时，或当常量、公式、业务规则被硬编码在多个文件中时。在重构冲刺期间，或当新成员入职暴露出对哪一份副本才是权威版本的困惑时应用 DRY。",
    "core_concepts": [
      "Single Source of Truth",
      "Abstraction",
      "Knowledge Duplication vs Code Duplication",
      "Canonical Representation",
      "Rule of Three"
    ],
    "core_concepts_zh": [
      "唯一信源",
      "抽象化",
      "知识重复 vs 代码重复",
      "权威性表述",
      "三次法则"
    ],
    "timeline": [
      {
        "year": 1999,
        "event": "Andy Hunt and Dave Thomas coin the term in \"The Pragmatic Programmer\""
      },
      {
        "year": 2000,
        "event": "Ruby on Rails adopts DRY as a core design principle, popularizing it widely"
      },
      {
        "year": 2005,
        "event": "Concept expanded to infrastructure: Terraform and config-as-code embrace DRY"
      },
      {
        "year": 2015,
        "event": "GraphQL's single schema serves as the DRY contract between client and server"
      },
      {
        "year": 2020,
        "event": "DRY tension with micro-services discussed: shared libraries vs. duplication across services"
      }
    ],
    "timeline_zh": [
      {
        "year": 1999,
        "event": "Andy Hunt 和 Dave Thomas 在《务实的程序员》中提出该术语"
      },
      {
        "year": 2000,
        "event": "Ruby on Rails 将 DRY 作为核心设计哲学，广泛推广"
      },
      {
        "year": 2005,
        "event": "DRY 理念延伸到基础设施：Terraform 和配置即代码拥抱 DRY"
      },
      {
        "year": 2015,
        "event": "GraphQL 的单一 Schema 作为客户端与服务端间的 DRY 契约"
      },
      {
        "year": 2020,
        "event": "微服务中 DRY 的张力被讨论：共享库 vs 跨服务重复"
      }
    ],
    "dos": [
      "Extract shared logic into well-named functions or modules",
      "Use configuration or constants for repeated values",
      "Apply templates or code generation to eliminate structural duplication",
      "Review for duplication at the knowledge level, not just syntactic level",
      "Follow the Rule of Three: abstract on the third repetition"
    ],
    "dos_zh": [
      "将共享逻辑提取到命名良好的函数或模块中",
      "对重复的值使用配置或常量",
      "应用模板或代码生成消除结构性重复",
      "在知识层面而不仅仅是语法层面审查重复",
      "遵循三次则则：第三次重复时再抽象"
    ],
    "donts": [
      "Don't abstract too early — duplication is cheaper than the wrong abstraction",
      "Don't merge unrelated things just because they look similar",
      "Don't apply DRY across microservice boundaries at the cost of coupling",
      "Don't confuse DRY with removing all redundancy — tests intentionally repeat setup",
      "Don't DRY-up accidental duplication that may diverge in the future"
    ],
    "donts_zh": [
      "不要过早抽象 —— 重复比错误的抽象代价更低",
      "不要仅因为看起来相似就将无关的东西合并",
      "不要以小类服务共耐为代价跨服务应用 DRY",
      "不要把 DRY 与消除所有冗余混淆 —— 测试有意地重复设置",
      "不要小印偶然重复的部分，它们未来可能会分岐"
    ],
    "case_study": "Ruby on Rails embodies DRY through its convention-over-configuration philosophy. The ActiveRecord ORM eliminates duplicated SQL and model definitions; Action Pack routes map URLs to controllers without repetitive wiring; Rails generators produce standard scaffolding so developers never manually repeat boilerplate. The result: a new Rails app has almost no duplicated setup code compared to raw PHP or early Java web apps.",
    "case_study_zh": "Ruby on Rails 通过「约定优于配置」的哲学深刻体现了 DRY 原则。ActiveRecord ORM 消除了重复的 SQL 和模型定义；Action Pack 路由无需重复配线就能将 URL 映射到控制器；Rails 生成器自动生产标准脚手架代码。结果：一个新的 Rails 应用几乎没有重复的设置代码。",
    "case_study_company": "Ruby on Rails",
    "when_not_to_use": "Avoid forced DRY across microservice boundaries where coupling would be worse than duplication. Don't apply it to test setup code — explicit repetition in tests aids readability. Early-stage prototypes may be better off with duplicate code that's easier to delete.",
    "when_not_to_use_zh": "避免在微服务边界强制应用 DRY，耦合的代价将大于重复。不要应用于测试设置代码 —— 测试中显式重复有助于可读性。早期原型可能适合保留重复代码，便于删除。",
    "primary_source": "Hunt, A. & Thomas, D. (1999). 「The Pragmatic Programmer: From Journeyman to Master.」 Addison-Wesley.",
    "secondary_sources": [
      "Fowler, M. (1999). 「Refactoring: Improving the Design of Existing Code.」 Addison-Wesley.",
      "Martin, R.C. (2008). 「Clean Code.」 Prentice Hall."
    ],
    "origin_source": "Coined in 「The Pragmatic Programmer」 (1999) by Andy Hunt and Dave Thomas as one of the most important principles of software development.",
    "origin_source_zh": "来自 Andy Hunt 和 Dave Thomas《务实的程序员》（1999），被列为软件开发最重要原则之一。",
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "functional-core-imperative-shell",
        "type": "related"
      }
    ]
  },
  {
    "id": 325,
    "name": "KISS (Keep It Simple, Stupid)",
    "name_zh": "KISS 原则（保持简单，不要复杂）",
    "slug": "kiss-principle",
    "category": "coding",
    "desc": "Most systems work best if they are kept simple rather than made complicated. Complexity is the enemy of reliability. Design the simplest thing that could possibly work, and resist the temptation to add cleverness.",
    "desc_zh": "大多数系统保持简单比过度设计更有效。复杂性是可靠性的敌人。设计最简单可能工作的方案，抗拒过度巧妙的诱惑。",
    "steps": [
      "Define the minimum requirement: what must this actually do?",
      "Design the simplest solution that satisfies those requirements",
      "Remove any element that is not essential to the core requirement",
      "Ask 「would a new team member understand this in 5 minutes?」",
      "Prefer standard patterns over custom cleverness"
    ],
    "steps_zh": [
      "定义最小需求：这个实际上需要做什么？",
      "设计满足这些需求的最简单方案",
      "删除不是核心需求必需的任何元素",
      "问「新团队成员能在 5 分钟内理解这个吗？」",
      "优先使用标准模式而非自定义巧妙设计"
    ],
    "viz_type": "pyramid",
    "viz_labels": [
      "Complex",
      "Simplified",
      "Simple",
      "Essential"
    ],
    "viz_labels_zh": [
      "复杂",
      "简化",
      "简单",
      "本质"
    ],
    "complexity": "beginner",
    "origin_author": "Kelly Johnson, Lockheed Skunk Works, 1960s",
    "adopters": [
      "Google 搜索",
      "Basecamp",
      "Unix/Linux 生态系统",
      "Go 编程语言",
      "SQLite"
    ],
    "related": [
      "clean-code-principles",
      "solid-principles",
      "functional-core-imperative-shell"
    ],
    "tags": [
      "kiss",
      "simplicity",
      "code-quality",
      "maintainability",
      "design-principles"
    ],
    "ai_relevant": false,
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "foundational",
    "when_to_use": "When designing APIs, internal modules, or user interfaces where cognitive load matters. Apply KISS when you catch yourself adding layers of abstraction, configuration options, or extensibility hooks that the current use-case does not require.",
    "when_to_use_zh": "当设计 API、内部模块或认知负荷重要的用户界面时。当你发现自己在添加当前用例不需要的抽象层、配置项或扩展性指容时应用。",
    "core_concepts": [
      "Simplicity",
      "Cognitive Load",
      "Minimal Design",
      "Readability",
      "Occam's Razor in Engineering"
    ],
    "core_concepts_zh": [
      "简单性",
      "认知负荷",
      "最小化设计",
      "可读性",
      "工程界的奥卡姆剃刀"
    ],
    "timeline": [
      {
        "year": 1960,
        "event": "Kelly Johnson coins 「Keep It Simple, Stupid」 as a design principle at Lockheed Skunk Works"
      },
      {
        "year": 1978,
        "event": "Unix philosophy echoes KISS: 「Do one thing and do it well」"
      },
      {
        "year": 1998,
        "event": "Google Search launches with deliberately minimal UI, defying portal-era complexity"
      },
      {
        "year": 2009,
        "event": "Go language designed with KISS in mind: no generics, no exceptions, explicit error handling"
      },
      {
        "year": 2012,
        "event": "KISS resurfaces in API design debates: REST vs SOAP simplicity argument resolved by industry"
      }
    ],
    "timeline_zh": [
      {
        "year": 1960,
        "event": "Kelly Johnson 在洛克希德黄色作品部提出「保持简单」设计原则"
      },
      {
        "year": 1978,
        "event": "Unix 哲学呼应 KISS：「只做一件事并做好」"
      },
      {
        "year": 1998,
        "event": "Google 搜索以故意简洁的界面上线，挑战了门户网站时代的复杂性"
      },
      {
        "year": 2009,
        "event": "Go 语言以 KISS 为理念设计：无泛型、无异常、显式错误处理"
      },
      {
        "year": 2012,
        "event": "KISS 在 API 设计论争中再度兴起：REST vs SOAP 简单性之争由行业内部定论"
      }
    ],
    "dos": [
      "Start with the simplest possible implementation",
      "Prefer standard library solutions over custom implementations",
      "Write code that reads like prose — minimize indirection",
      "Limit function parameters; prefer small, focused functions",
      "Question every layer of abstraction: does it genuinely reduce complexity?"
    ],
    "dos_zh": [
      "从最简单的实现开始",
      "优先使用标准库解决方案而非自定义实现",
      "写出像散文一样可读的代码 —— 减少间接层",
      "限制函数参数数量；优先小而专注的函数",
      "质疑每一个抽象层：它真的降低了复杂性吗？"
    ],
    "donts": [
      "Don't add flexibility 「just in case」 — build for now, refactor later",
      "Don't use complex design patterns when a plain function would suffice",
      "Don't mistake terseness for simplicity — one-liners can be hard to read",
      "Don't build framework-like abstractions inside application code",
      "Don't let tooling complexity (CI, Docker) negate the simplicity gains in code"
    ],
    "donts_zh": [
      "不要「以防万一」为由添加灵活性 —— 先满足当前，局后再重构",
      "当一个普通函数就能解决时，不要使用复杂的设计模式",
      "不要把简洁和简短混淆 —— 一行式代码可能难以阅读",
      "不要在应用程序代码内部构建类似框架的抽象",
      "不要让工具复杂性（CI、Docker）抗消代码中的简单性收益"
    ],
    "case_study": "Google Search launched in 1998 with a single text box on a white page — a radical departure from Yahoo and AltaVista's crowded portal homepages. This KISS-driven design reduced cognitive load, focused user intent, and became iconic. Internally, Google's PageRank algorithm was also conceptually simple: count links as votes. The combination of simple UI and simple-to-explain algorithm built the world's dominant search engine.",
    "case_study_zh": "Google 搜索于 1998 年以白页上一个文本框上线 —— 这与 Yahoo 和 AltaVista 拥挤的门户页形成强烈对比。这一符合 KISS 的设计降低了认知负荷，聚焦用户意图，成为标志性设计。PageRank 算法也概念简单：将链接计为投票。简单 UI 与可解释算法的结合造就了全球最强搜索引擎。",
    "case_study_company": "Google Search",
    "when_not_to_use": "KISS should not be used to justify under-engineering. Security, concurrency, and fault-tolerance genuinely require complexity. Don't apply KISS to dismiss necessary abstractions that protect long-term maintainability.",
    "when_not_to_use_zh": "KISS 不应被用来为欠工进行辩护。安全性、并发和容错能力确实需要复杂性。不要用 KISS 来屈新必要的抽象。",
    "primary_source": "Kelly Johnson's design principle documented in Lockheed Skunk Works engineering history (1960s). Popularized in software by McIlroy's Unix philosophy (1978).",
    "secondary_sources": [
      "Raymond, E.S. (2003). 「The Art of Unix Programming.」 Addison-Wesley.",
      "Martin, R.C. (2008). 「Clean Code.」 Prentice Hall."
    ],
    "origin_source": "Coined by Kelly Johnson at Lockheed Skunk Works in the 1960s as an aerospace engineering design guideline; adapted to software through Unix philosophy.",
    "origin_source_zh": "由 Kelly Johnson 在 20 世纪 60 年代洛克希德黄色作品部提出，最初为航空工程设计准则，后通过 Unix 哲学应用于软件领域。",
    "typed_relations": [
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "solid-principles",
        "type": "related"
      },
      {
        "slug": "functional-core-imperative-shell",
        "type": "complement"
      }
    ]
  },
  {
    "id": 326,
    "name": "YAGNI (You Aren't Gonna Need It)",
    "name_zh": "YAGNI 原则（你不会用到它的）",
    "slug": "yagni-principle",
    "category": "coding",
    "desc": "Always implement things when you actually need them, never when you just foresee that you might need them. Premature generalization is as harmful as premature optimization.",
    "desc_zh": "只在真正需要时才实现功能，不要因为预见未来可能需要就提前实现。过早泛化与过早优化同样有害。",
    "steps": [
      "Identify what the current story or requirement actually specifies",
      "Implement only that requirement — nothing more",
      "Resist adding hooks, extension points, or config flags 「for later」",
      "Review your PR: is every line driven by a real, current requirement?",
      "Trust that the codebase will be refactorable when the need actually arises"
    ],
    "steps_zh": [
      "确认当前用户故事或需求实际指定了什么",
      "只实现该需求 —— 不多不少",
      "抗拒为「日后」添加钩子、扩展点或配置开关",
      "审查你的 PR：每一行是否都有真实的当前需求驱动？",
      "相信当需求真正来临时代码库可以重构"
    ],
    "viz_type": "flow",
    "viz_labels": [
      "Requirement",
      "Build Now",
      "Defer",
      "Avoid Waste"
    ],
    "viz_labels_zh": [
      "需求",
      "当前构建",
      "推迟实现",
      "避免浪费"
    ],
    "complexity": "beginner",
    "origin_author": "Ron Jeffries, Extreme Programming community, late 1990s",
    "adopters": [
      "Basecamp / 37signals",
      "ThoughtWorks",
      "Pivotal Labs",
      "Shopify",
      "Stack Overflow"
    ],
    "related": [
      "solid-principles",
      "clean-code-principles",
      "gof-design-patterns"
    ],
    "tags": [
      "yagni",
      "xp",
      "agile",
      "over-engineering",
      "simplicity"
    ],
    "ai_relevant": false,
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "performance"
    ],
    "maturity_ring": "foundational",
    "when_to_use": "Apply YAGNI whenever you catch yourself adding code that is not required by the current iteration. Especially useful in agile teams where requirements are refined over time and future needs are uncertain.",
    "when_to_use_zh": "当你发现自己正在添加当前迭代不需要的代码时应用。在需求随时间深化、未来不确定的敏捷团队中尤为适用。",
    "core_concepts": [
      "Incremental Development",
      "Premature Generalization",
      "Cost of Unused Code",
      "Iterative Delivery",
      "Extreme Programming (XP)"
    ],
    "core_concepts_zh": [
      "增量开发",
      "过早泛化",
      "无用代码的成本",
      "迭代交付",
      "极限编程（XP）"
    ],
    "timeline": [
      {
        "year": 1999,
        "event": "Ron Jeffries introduces YAGNI as a core Extreme Programming practice"
      },
      {
        "year": 2001,
        "event": "Agile Manifesto formally embraces incremental delivery, reinforcing YAGNI"
      },
      {
        "year": 2004,
        "event": "Basecamp / 37signals publish 「Getting Real」 — anti-over-engineering manifesto echoing YAGNI"
      },
      {
        "year": 2010,
        "event": "Lean Startup popularizes 「MVP」 — YAGNI at product level"
      },
      {
        "year": 2018,
        "event": "GitHub study on feature usage: ~64% of features in enterprise software rarely or never used"
      }
    ],
    "timeline_zh": [
      {
        "year": 1999,
        "event": "Ron Jeffries 将 YAGNI 作为极限编程的核心实践引入"
      },
      {
        "year": 2001,
        "event": "敏捷宣言正式拥抱增量交付，强化 YAGNI"
      },
      {
        "year": 2004,
        "event": "Basecamp / 37signals 出版《回归真实》—— 反过度工程宣言，呼应 YAGNI"
      },
      {
        "year": 2010,
        "event": "精益创业推广「MVP」—— YAGNI 在产品层面的体现"
      },
      {
        "year": 2018,
        "event": "GitHub 研究显示：企业软件中约64%的功能很少使用或从未使用"
      }
    ],
    "dos": [
      "Implement only what the current requirement demands",
      "Delete speculative code when cleaning up branches",
      "Use feature flags to defer decisions rather than building unused paths",
      "Communicate trade-offs to stakeholders: 「we can add X when we need it」",
      "Review PRs for 「just in case」 code and flag it"
    ],
    "dos_zh": [
      "只实现当前需求所要求的功能",
      "清理分支时删除投机性代码",
      "用功能开关推迟决策，而不是构建无用路径",
      "就权衡进行沟通：「需要时我们再加 X」",
      "审查 PR 中的「以备不时之需」代码并指出"
    ],
    "donts": [
      "Don't add plugin systems, strategy hooks, or 「extensible」 layers before they're requested",
      "Don't write adapter code for hypothetical future data sources",
      "Don't generalize to N cases when only 1 or 2 are needed today",
      "Don't keep dead code paths 「in case we need them later」",
      "Don't confuse YAGNI with avoiding good architecture — basic layering is not premature"
    ],
    "donts_zh": [
      "不要在尚未请求前添加插件系统、策略钩子或「可扩展」层",
      "不要为假设的未来数据源编写适配器代码",
      "当只需要 1 或 2 个情况时，不要泛化为 N 种情况",
      "不要保留死代码路径「以防后来需要」",
      "不要把 YAGNI 与避免良好架构混淆 —— 基础分层不是过早优化"
    ],
    "case_study": "Basecamp (formerly 37signals) built its project management product by religiously following YAGNI. Their book 「Getting Real」 documents how features were only built when customers demanded them, not anticipated. The result was a lean product that shipped faster, had fewer bugs, and cost less to maintain. Competitor products loaded with 「maybe useful」 features were harder to learn and slower to develop.",
    "case_study_zh": "Basecamp（前身 37signals）在构建项目管理产品时严格遵循 YAGNI。其《回归真实》记录了只有当客户明确要求时才构建功能的做法。结果是一个交付更快、缺陷更少、维护成本更低的精类产品。",
    "case_study_company": "Basecamp (37signals)",
    "when_not_to_use": "Security and compliance requirements must be built in from the start — YAGNI does not apply. Infrastructure choices (database engine, message queue) have high switching costs and deserve some forethought. YAGNI applies to feature code, not foundational architectural decisions.",
    "when_not_to_use_zh": "安全合规需求必须从一开始就内置——YAGNI 不适用于此。基础设施选择（数据库引擎、消息队列）切换成本高，应有一定先见。YAGNI 针对功能代码，而非基础架构决策。",
    "primary_source": "Jeffries, R. (1999). Extreme Programming Installed. Addison-Wesley.",
    "secondary_sources": [
      "Fried, J. & Heinemeier Hansson, D. (2010). 「Rework.」 Crown Business.",
      "Beck, K. (2000). 「Extreme Programming Explained.」 Addison-Wesley."
    ],
    "origin_source": "Coined by Ron Jeffries as part of Extreme Programming (XP) in the late 1990s; popularized through the XP community and the Agile movement.",
    "origin_source_zh": "由 Ron Jeffries 在 20 世纪 90 年代末作为极限编程（XP）的一部分提出，并通过 XP 社区和敏捷运动广泛传播。",
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "gof-design-patterns",
        "type": "related"
      }
    ]
  },
  {
    "id": 327,
    "name": "Composition over Inheritance",
    "name_zh": "组合优于继承",
    "slug": "composition-over-inheritance",
    "category": "coding",
    "desc": "Favor object composition over class inheritance to achieve code reuse and polymorphism. Inheritance creates tight coupling between parent and child classes; composition assembles behavior from interchangeable parts, making systems more flexible and testable.",
    "desc_zh": "为了实现代码复用和多态，应优先采用对象组合而非类继承。继承在父子类之间建立紧耦合；组合则将行为从可互换的部件中组装，使系统更灵活、可测试。",
    "steps": [
      "Identify behavior that is being shared via inheritance",
      "Extract that behavior into a small, focused interface or class (a 「component」)",
      "Compose the target class by holding a reference to that component",
      "Inject the component via constructor or setter for testability",
      "Verify that the composed class can swap implementations at runtime or test time"
    ],
    "steps_zh": [
      "识别通过继承共享的行为",
      "将该行为提取为小而专注的接口或类（一个「组件」）",
      "通过持有该组件的引用来组合目标类",
      "通过构造函数或 setter 注入组件以提高可测试性",
      "验证组合类可在运行时或测试时替换实现"
    ],
    "viz_type": "tree",
    "viz_labels": [
      "Base Type",
      "Composed Part",
      "Behavior",
      "Flexibility"
    ],
    "viz_labels_zh": [
      "基础类型",
      "组合部件",
      "行为",
      "灵活性"
    ],
    "complexity": "intermediate",
    "origin_author": "Gang of Four (Gamma, Helm, Johnson, Vlissides), 1994",
    "adopters": [
      "React / Meta（组件模型）",
      "Go（无类继承）",
      "Rust（trait 组合）",
      "Spring Framework",
      "Unity 游戏引擎"
    ],
    "related": [
      "solid-principles",
      "gof-design-patterns",
      "decorator-pattern"
    ],
    "tags": [
      "composition",
      "inheritance",
      "oop",
      "design-principles",
      "testability"
    ],
    "ai_relevant": false,
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "foundational",
    "when_to_use": "When you find yourself duplicating methods across sibling classes, or when deep inheritance hierarchies make changes risky. Apply when you need to mix and match behaviors dynamically, or when unit testing is made difficult by inherited state.",
    "when_to_use_zh": "当你发现自己在兄弟类之间重复方法，或当深层继承层次使变更变得危险时。当需要动态混合行为，或继承的状态妨碍了单元测试时应用。",
    "core_concepts": [
      "Has-a vs Is-a",
      "Delegation",
      "Interface Segregation",
      "Dependency Injection",
      "Mixin / Trait Pattern"
    ],
    "core_concepts_zh": [
      "Has-a vs Is-a 关系",
      "委托",
      "接口隔离",
      "依赖注入",
      "Mixin / Trait 模式"
    ],
    "timeline": [
      {
        "year": 1994,
        "event": "Gang of Four formally states 「Favor composition over inheritance」 in Design Patterns"
      },
      {
        "year": 2000,
        "event": "Java's interface + delegation pattern becomes the standard way to avoid deep hierarchies"
      },
      {
        "year": 2009,
        "event": "Go launches without class inheritance — composition is the only mechanism"
      },
      {
        "year": 2013,
        "event": "React introduces component composition model, making it mainstream in UI development"
      },
      {
        "year": 2015,
        "event": "Rust stabilizes, proving trait composition can replace inheritance across systems programming"
      }
    ],
    "timeline_zh": [
      {
        "year": 1994,
        "event": "GoF 在《设计模式》中正式提出「组合优于继承」"
      },
      {
        "year": 2000,
        "event": "Java 接口 + 委托模式成为避免深层次继承的标准方式"
      },
      {
        "year": 2009,
        "event": "Go 在没有类继承的情况下发布 —— 组合是唯一机制"
      },
      {
        "year": 2013,
        "event": "React 引入组件组合模型，使其在 UI 开发中成为主流"
      },
      {
        "year": 2015,
        "event": "Rust 稳定发布，证明 trait 组合可在系统编程中取代继承"
      }
    ],
    "dos": [
      "Model 「has-a」 relationships with composition, 「is-a」 with inheritance",
      "Keep composed components small and single-purpose",
      "Use dependency injection to supply composed parts — aids testability",
      "Prefer interfaces/protocols over concrete base classes",
      "Use mixins or traits for horizontal behavior sharing"
    ],
    "dos_zh": [
      "用组合建模「has-a」关系，用继承建模「is-a」关系",
      "保持组合组件小而单一职责",
      "使用依赖注入提供组合部分 —— 有助于可测试性",
      "优先使用接口/协议而非具体基类",
      "使用 mixin 或 trait 进行水平行为共享"
    ],
    "donts": [
      "Don't inherit just to reuse a few methods — extract and compose instead",
      "Don't create inheritance hierarchies deeper than 2-3 levels",
      "Don't use composition just to avoid thinking about the domain model",
      "Don't over-decompose: too many tiny components create their own complexity",
      "Don't forget that some 「is-a」 relationships are genuine and inheritance is correct"
    ],
    "donts_zh": [
      "不要仅为了复用少数方法就继承 —— 提取并组合",
      "不要创建超过 2-3 层的继承层次",
      "不要仅为了避免思考领域模型而使用组合",
      "不要过度拆分：太多小组件会产生其自身的复杂性",
      "不要忘记有些「is-a」关系是真实存在的，这种情况下继承是正确的"
    ],
    "case_study": "React's component model is the most mainstream example of composition over inheritance in UI development. Prior frameworks like Backbone used inheritance chains; React replaced this with composable functional components. A 「Button」 can compose an 「Icon」 and a 「Label」 without any class hierarchy. React's own documentation explicitly states 「At Facebook, we use React in thousands of components, and we haven't found any use cases where we would recommend creating component inheritance hierarchies.」",
    "case_study_zh": "React 的组件模型是 UI 开发中组合优于继承最主流的范例。Backbone 等早期框架使用继承链；React 将其替换为可组合的函数式组件。一个「Button」可以组合「Icon」和「Label」，无需任何类层次。React 官方文档明确表示：「在 Facebook，我们在数千个组件中使用 React，未发现任何需要创建组件继承层次的场景。」",
    "case_study_company": "React (Meta)",
    "when_not_to_use": "Don't avoid inheritance dogmatically. When the 「is-a」 relationship is stable and domain-accurate (Animal → Dog), shallow inheritance is simpler. Language-level constructs like abstract base classes in Python or interfaces in Java are often the right tool.",
    "when_not_to_use_zh": "不要教条地排斥继承。当「is-a」关系稳定且领域准确时（如动物→狗），浅层继承更简单。Python 抽象基类或 Java 接口等语言级构造往往是正确选择。",
    "primary_source": "Gamma, E., Helm, R., Johnson, R., & Vlissides, J. (1994). 「Design Patterns: Elements of Reusable Object-Oriented Software.」 Addison-Wesley.",
    "secondary_sources": [
      "Martin, R.C. (2008). 「Clean Code.」 Prentice Hall.",
      "React Documentation (2023). 「Composition vs Inheritance.」 https://react.dev"
    ],
    "origin_source": "Formally stated by the Gang of Four in Design Patterns (1994): 「Favor object composition over class inheritance.」",
    "origin_source_zh": "由 GoF 在《设计模式》（1994）中正式表述：「优先采用对象组合而非类继承。」",
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "gof-design-patterns",
        "type": "extends"
      },
      {
        "slug": "decorator-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 328,
    "name": "Law of Demeter (Principle of Least Knowledge)",
    "name_zh": "迪米特得法则（最少知识原则）",
    "slug": "law-of-demeter",
    "category": "coding",
    "desc": "A module should not know about the internal workings of the objects it manipulates. An object should only call methods on: itself, its parameters, objects it creates, and its direct component objects — never on objects returned by those calls.",
    "desc_zh": "模块不应该了解其操作对象的内部运作。一个对象只应调用：它自身、它的参数、它创建的对象以及它直接的组件对象的方法——而不是这些调用返回的对象。",
    "steps": [
      "Review each method call chain (a.b().c().d()) and flag violations",
      "Identify what knowledge is being reached through the chain",
      "Add a method to the intermediate object that performs the required operation internally",
      "Have the caller invoke that new method instead of traversing the chain",
      "Verify the caller no longer knows about the internal structure"
    ],
    "steps_zh": [
      "审查每个方法调用链（a.b().c().d()）并标记违规",
      "识别通过调用链访问了什么知识",
      "向中间对象添加一个在内部执行所需操作的方法",
      "让调用方调用该新方法而非遍历调用链",
      "验证调用方不再知道内部结构"
    ],
    "viz_type": "tree",
    "viz_labels": [
      "Object",
      "Direct Neighbor",
      "Method Call",
      "Coupling"
    ],
    "viz_labels_zh": [
      "对象",
      "直接邻居",
      "方法调用",
      "耦合度"
    ],
    "complexity": "intermediate",
    "origin_author": "Karl Lieberherr, Ian Holland, Arthur Riel — Northeastern University, 1987",
    "adopters": [
      "Java 最佳实践（Spring、JPA）",
      "C# / .NET",
      "Python",
      "Ruby on Rails",
      "AWS SDK v2"
    ],
    "related": [
      "solid-principles",
      "clean-code-principles",
      "strategy-pattern"
    ],
    "tags": [
      "law-of-demeter",
      "coupling",
      "encapsulation",
      "oop",
      "maintainability"
    ],
    "ai_relevant": false,
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "foundational",
    "when_to_use": "When you see long method chains that reach deep into object graphs. Apply when changing an internal object requires updating many call sites. Essential when designing APIs intended for external consumption where encapsulation is critical.",
    "when_to_use_zh": "当你看到深入对象图调用链时。当修改内部对象需要更新大量调用处时应用。在设计外部 API 时，封装至关重要。",
    "core_concepts": [
      "Coupling",
      "Encapsulation",
      "Tell, Don't Ask",
      "Information Hiding",
      "Structural Coupling"
    ],
    "core_concepts_zh": [
      "耦合",
      "封装",
      "「命令，而非查询」",
      "信息隐藏",
      "结构耦合"
    ],
    "timeline": [
      {
        "year": 1987,
        "event": "Karl Lieberherr, Ian Holland, and Arthur Riel publish 「Object-Oriented Programming: An Objective Sense of Style」 at OOPSLA, introducing the Law of Demeter"
      },
      {
        "year": 1989,
        "event": "Lieberherr and Holland formalize it as a style rule for object-oriented design"
      },
      {
        "year": 1999,
        "event": "Hunt & Thomas reframe it in 「The Pragmatic Programmer」 as 「Principle of Least Knowledge」"
      },
      {
        "year": 2004,
        "event": "Martin Fowler discusses 「Tell, Don't Ask」 as the complementary behavioral pattern"
      },
      {
        "year": 2008,
        "event": "Robert Martin cites LoD in 「Clean Code」, cementing it in mainstream software craftsmanship"
      }
    ],
    "timeline_zh": [
      {
        "year": 1987,
        "event": "Karl Lieberherr、Ian Holland 和 Arthur Riel 在 OOPSLA 发表论文，引入迪米特得法则"
      },
      {
        "year": 1989,
        "event": "Lieberherr 和 Holland 将其正式化为面向对象设计的样式规则"
      },
      {
        "year": 1999,
        "event": "Hunt & Thomas 在《务实的程序员》中将其重新表述为「最小知识原则」"
      },
      {
        "year": 2004,
        "event": "Martin Fowler 讨论「命令，而非查询」作为补充的行为模式"
      },
      {
        "year": 2008,
        "event": "Robert Martin 在《洁净代码》中引用 LoD，将其巩固入主流软件工艺"
      }
    ],
    "dos": [
      "Add 「tell」 methods to intermediate objects that encapsulate the traversal",
      "Use the 「Tell, Don't Ask」 principle to command objects rather than query their internals",
      "Keep object graphs shallow — deep graphs signal design problems",
      "Design APIs that return value objects or DTOs instead of exposing internal references",
      "Write unit tests that expose chain violations by requiring complex mock setups"
    ],
    "dos_zh": [
      "向中间对象添加封装遍历的「命令」方法",
      "用「命令，而非查询」原则指挥对象而非查询其内部状态",
      "保持对象图浅层 —— 深层结构暗示设计问题",
      "设计返回值对象或 DTO 而非暴露内部引用的 API",
      "编写单元测试以展露链式调用违规（复杂模拟设置为信号）"
    ],
    "donts": [
      "Don't write 「train wrecks」: a.getB().getC().doSomething()",
      "Don't expose internal collections directly — return views or copies",
      "Don't violate LoD just for convenience in scripts or one-off code",
      "Don't confuse LoD with avoiding all chaining — fluent builders on the same object are fine",
      "Don't add trivial delegate methods just to satisfy the law — use judgment"
    ],
    "donts_zh": [
      "不要写「火车炸毁」式调用：a.getB().getC().doSomething()",
      "不要直接暴露内部集合 —— 返回视图或副本",
      "不要仅为了便利就在脚本或一次性代码中违反 LoD",
      "不要把 LoD 与避免所有链式调用混淆 —— 对同一对象的流式构建器是允许的",
      "不要仅为满足该法则就添加琐小的委托方法 —— 需要判断"
    ],
    "case_study": "The AWS SDK v2 for Java was redesigned with LoD in mind. The v1 SDK exposed deeply nested configuration objects, leading to call chains like client.getConfig().getCredentials().getAWSAccessKeyId(). SDK v2 replaced this with immutable builder objects and explicit credential providers, so callers interact only with the top-level client. This made the SDK far easier to mock in unit tests and eliminated coupling to internal AWS service structures.",
    "case_study_zh": "Java AWS SDK v2 就是考虑到 LoD 而重设计的。v1 SDK 暴露了深层嵌套的配置对象，导致如 client.getConfig().getCredentials().getAWSAccessKeyId() 的调用链。SDK v2 用不可变构建器对象和显式凭证提供者取代了这些，调用方只需与顶层客户端交互。这使 SDK 在单元测试中更易于 mock，并消除了对内部 AWS 服务结构的耦合。",
    "case_study_company": "Amazon (AWS SDK)",
    "when_not_to_use": "Don't apply LoD so strictly that you create 「wrapper hell」 — dozens of trivial delegate methods. In data pipeline or ETL code, chained transformations on the same object are idiomatic. Framework internals often need direct graph access for performance.",
    "when_not_to_use_zh": "不要将 LoD 应用得如此严格以至创造「包装地狱」——数十个琐小的委托方法。在数据流水线或 ETL 代码中，对同一对象进行链式转换是惯用写法。框架内部组件出于性能考虑通常需要直接访问图结构。",
    "primary_source": "Lieberherr, K., Holland, I., & Riel, A. (1988). 「Object-Oriented Programming: An Objective Sense of Style.」 OOPSLA '87 Proceedings.",
    "secondary_sources": [
      "Hunt, A. & Thomas, D. (1999). 「The Pragmatic Programmer.」 Addison-Wesley.",
      "Martin, R.C. (2008). 「Clean Code.」 Prentice Hall."
    ],
    "origin_source": "Presented at OOPSLA 1987 by Lieberherr, Holland, and Riel at Northeastern University. Named 「Demeter」 after the Greek goddess of agriculture and a research project at Northeastern.",
    "origin_source_zh": "由 Lieberherr、Holland 和 Riel 在东北大学于 1987 年 OOPSLA 上发表。名称「迪米特」源自希腊农业女神和东北大学的一个研究项目。",
    "typed_relations": [
      {
        "slug": "solid-principles",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      },
      {
        "slug": "strategy-pattern",
        "type": "related"
      }
    ]
  },
  {
    "id": 331,
    "name": "Active Record Pattern",
    "name_zh": "活动记录模式",
    "slug": "active-record-pattern",
    "category": "coding",
    "desc": "Domain object that wraps a database row and encapsulates CRUD logic within itself",
    "desc_zh": "将数据库行封装并在对象内部实现 CRUD 逻辑的领域对象模式",
    "steps": [
      "Map a class to a database table: each class property corresponds to a column, and each instance represents exactly one row in that table",
      "Embed persistence methods on the object: implement save(), find(), update(), and delete() directly on the domain class so callers never touch SQL directly",
      "Use class-level finders for queries: expose static or class methods like User.findByEmail() that translate to SQL SELECT and return hydrated objects",
      "Leverage lifecycle callbacks: hook before_save, after_create, and similar callbacks on the class to enforce validation, timestamps, and side-effects",
      "Migrate schema changes alongside code: keep database migrations versioned next to the model files so schema and behaviour evolve together"
    ],
    "steps_zh": [
      "将类映射到数据库表：每个类属性对应一列，每个实例代表该表中的一行",
      "在对象上内嵌持久化方法：直接在领域类上实现 save()、find()、update() 和 delete()，调用方无需直接接触 SQL",
      "使用类级别查询方法：暴露 User.findByEmail() 等静态或类方法，将其转换为 SQL SELECT 并返回填充好的对象",
      "利用生命周期回调：在类上挂载 before_save、after_create 等回调，以强制验证、时间戳和副作用",
      "随代码一起迁移模式变更：将数据库迁移文件版本化并置于模型文件旁，使模式与行为同步演进"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Domain Object",
      "Database Table",
      "CRUD",
      "Query"
    ],
    "viz_labels_zh": [
      "领域对象",
      "数据库表",
      "增删改查",
      "查询"
    ],
    "related": [
      "repository-pattern",
      "data-mapper-pattern",
      "data-transfer-object"
    ],
    "tags": [
      "active-record",
      "orm",
      "persistence",
      "database"
    ],
    "origin_author": "Martin Fowler, 2002",
    "origin_source": "Patterns of Enterprise Application Architecture",
    "origin_source_zh": "《企业应用架构模式》",
    "complexity": "beginner",
    "when_to_use": [
      "Building CRUD-heavy applications where domain logic is thin and closely mirrors the database schema",
      "Rapid prototyping or small-to-medium web applications where development speed outweighs architectural purity",
      "Teams that want a single cohesive object to serve as both domain model and persistence layer without a separate repository layer",
      "Applications using convention-over-configuration frameworks such as Rails or Laravel where Active Record is the idiomatic approach"
    ],
    "when_to_use_zh": [
      "构建以 CRUD 为主、领域逻辑较薄且紧密映射数据库模式的应用",
      "快速原型或中小型 Web 应用，开发速度优先于架构纯粹性",
      "希望用单一内聚对象同时充当领域模型和持久化层、无需独立仓储层的团队",
      "使用约定优于配置框架（如 Rails 或 Laravel）且 Active Record 是惯用方法的应用"
    ],
    "core_concepts": [
      "Table-per-class mapping: each Active Record class maps directly to one database table with no intermediate mapping layer",
      "Self-contained persistence: the domain object owns its own SQL or query-builder calls, eliminating the need for a separate DAO or repository class",
      "Lifecycle callbacks: hooks executed automatically at create, update, and delete events allow cross-cutting concerns like validation and auditing to live on the model",
      "Finder methods: class-level query methods return pre-hydrated objects, encapsulating SQL behind a domain-friendly API",
      "Convention over configuration: naming conventions (plural table name, id primary key) eliminate boilerplate mapping configuration"
    ],
    "core_concepts_zh": [
      "每类一表映射：每个活动记录类直接映射到一张数据库表，无需中间映射层",
      "自包含持久化：领域对象拥有自己的 SQL 或查询构建器调用，无需独立的 DAO 或仓储类",
      "生命周期回调：在创建、更新、删除事件时自动触发的钩子，使验证和审计等横切关注点驻留在模型上",
      "查询方法：类级别查询方法返回预填充对象，将 SQL 封装在友好的领域 API 之后",
      "约定优于配置：命名约定（复数表名、id 主键）消除了样板映射配置"
    ],
    "timeline": [
      [
        "2002",
        "Martin Fowler names and documents the Active Record pattern in 'Patterns of Enterprise Application Architecture'"
      ],
      [
        "2004",
        "Ruby on Rails ships ActiveRecord as its default ORM, popularising the pattern worldwide"
      ],
      [
        "2011",
        "Laravel introduces Eloquent ORM, bringing Active Record to the PHP community with an expressive fluent API"
      ],
      [
        "2016",
        "Active Record remains dominant in web frameworks while Data Mapper alternatives gain traction for complex domains"
      ]
    ],
    "timeline_zh": [
      [
        "2002",
        "Martin Fowler 在《企业应用架构模式》中命名并记录活动记录模式"
      ],
      [
        "2004",
        "Ruby on Rails 将 ActiveRecord 作为默认 ORM 发布，使该模式在全球流行"
      ],
      [
        "2011",
        "Laravel 引入 Eloquent ORM，以富有表现力的流式 API 将活动记录带入 PHP 社区"
      ],
      [
        "2016",
        "活动记录在 Web 框架中仍占主导地位，而数据映射器替代方案因复杂领域逐渐获得关注"
      ]
    ],
    "dos": [
      "Do use Active Record for straightforward CRUD where the domain model closely reflects the table structure",
      "Do leverage lifecycle callbacks for validation and automatic timestamp management rather than duplicating logic in service layers",
      "Do rely on framework-provided migration tooling to keep schema and model in sync across environments",
      "Do add scopes or named query methods to encapsulate common filters instead of scattering raw queries across the codebase"
    ],
    "dos_zh": [
      "对于领域模型与表结构高度一致的直接 CRUD，使用活动记录",
      "利用生命周期回调进行验证和自动时间戳管理，而非在服务层重复逻辑",
      "依赖框架提供的迁移工具跨环境保持模式与模型同步",
      "添加作用域或命名查询方法封装常用过滤条件，而非将裸查询分散在代码库中"
    ],
    "donts": [
      "Don't use Active Record for complex domains with rich business rules — the coupling between persistence and behaviour becomes a maintenance burden",
      "Don't write business logic inside lifecycle callbacks because it becomes invisible to callers and hard to test in isolation",
      "Don't share Active Record objects across service boundaries as DTOs because it leaks persistence details and schema changes cascade unexpectedly",
      "Don't let models accumulate hundreds of methods — extract service objects or use Data Mapper when a model grows beyond a single responsibility"
    ],
    "donts_zh": [
      "不要在具有丰富业务规则的复杂领域中使用活动记录——持久化与行为的耦合会成为维护负担",
      "不要在生命周期回调中编写业务逻辑，因为这对调用方不可见且难以独立测试",
      "不要将活动记录对象作为 DTO 跨服务边界共享，因为这会泄漏持久化细节，导致模式变更意外级联",
      "不要让模型积累数百个方法——当模型超出单一职责时，改用服务对象或数据映射器"
    ],
    "case_study_company": "Shopify",
    "case_study": "Shopify's core monolith is built on Ruby on Rails and uses ActiveRecord throughout. Each core entity — Order, Product, Customer, Variant — is an Active Record model mapped directly to a database table. This allowed Shopify to move extremely fast in its early years, shipping new merchant features weekly. As the platform scaled to millions of merchants, the team learned to discipline Active Record usage: models were forbidden from calling other models' persistence methods directly, and background jobs were kept out of callbacks. These constraints preserved the productivity benefits while avoiding the worst coupling pitfalls of naive Active Record usage.",
    "case_study_zh": "Shopify 的核心单体应用基于 Ruby on Rails 构建，全面使用 ActiveRecord。每个核心实体——订单、商品、客户、变体——都是直接映射到数据库表的活动记录模型。这使 Shopify 在早期能够快速迭代，每周发布新的商家功能。随着平台扩展到数百万商家，团队学会了规范活动记录的使用：禁止模型直接调用其他模型的持久化方法，后台任务也不放入回调中。这些约束在保留生产力优势的同时，避免了朴素活动记录使用中最严重的耦合陷阱。",
    "when_not_to_use": [
      "Complex domains with deep business rules that do not align cleanly with a single table — Data Mapper with a dedicated domain layer is more appropriate",
      "Microservices or shared libraries where exposing database-coupled objects across service boundaries introduces unwanted coupling",
      "Applications requiring multiple persistence backends (e.g., storing parts of an entity in a document store and a relational DB simultaneously)",
      "Highly concurrent write-heavy systems where fine-grained transaction control and optimistic locking need explicit, visible management"
    ],
    "when_not_to_use_zh": [
      "具有深度业务规则且与单张表映射不吻合的复杂领域——带专用领域层的数据映射器更合适",
      "微服务或共享库中，将数据库耦合的对象跨服务边界暴露会引入不必要的耦合",
      "需要多种持久化后端的应用（如将实体的不同部分同时存储在文档存储和关系型数据库中）",
      "高并发写入密集型系统，需要对事务控制和乐观锁进行精细、可见的管理"
    ],
    "adopters": [
      "Ruby on Rails",
      "Laravel (Eloquent)",
      "Django ORM",
      "Castle ActiveRecord",
      "Sequelize"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Fowler, M. (2002). \"Patterns of Enterprise Application Architecture\". Addison-Wesley. pp. 160–164.",
    "secondary_sources": [
      "Hansson, D.H. (2004). \"Ruby on Rails\". rails presentation and documentation, rubyonrails.org.",
      "Otwell, T. (2011). \"Laravel: Eloquent ORM\". laravel.com/docs/eloquent.",
      "Fowler, M. (2014). \"ActiveRecord\". martinfowler.com/eaaCatalog/activeRecord.html."
    ],
    "typed_relations": [
      {
        "slug": "repository-pattern",
        "type": "alternative"
      },
      {
        "slug": "data-mapper-pattern",
        "type": "alternative"
      },
      {
        "slug": "data-transfer-object",
        "type": "complement"
      }
    ]
  },
  {
    "id": 44,
    "name": "Test Pyramid",
    "name_zh": "测试金字塔",
    "slug": "test-pyramid",
    "category": "quality",
    "desc": "Balance unit, integration, and E2E tests by cost and speed",
    "desc_zh": "按成本与速度平衡单元测试、集成测试和端到端测试",
    "steps": [
      "Build a wide base of unit tests: fast, isolated tests covering business logic, pure functions, and edge cases with high coverage",
      "Add a middle layer of integration tests: verify that modules, services, and databases work together correctly at boundaries",
      "Write a thin top layer of E2E tests: cover critical user journeys only, accepting slower execution and higher maintenance cost",
      "Enforce the pyramid shape in CI: set coverage gates per layer and fail builds when the ratio inverts (too many E2E, too few unit)",
      "Review and rebalance quarterly: analyze flaky tests, slow suites, and coverage gaps to maintain the pyramid's health over time"
    ],
    "steps_zh": [
      "构建宽广的单元测试基础：快速、隔离的测试覆盖业务逻辑、纯函数和边界情况",
      "添加中间层集成测试：验证模块、服务和数据库在边界处正确协作",
      "编写薄顶层端到端测试：仅覆盖关键用户旅程，接受较慢的执行速度和较高的维护成本",
      "在CI中强制金字塔形状：为每层设置覆盖率门槛，比例倒置时构建失败",
      "季度审查与再平衡：分析不稳定测试、慢速套件和覆盖盲区，维持金字塔长期健康"
    ],
    "ai_relevant": true,
    "viz_type": "pyramid",
    "viz_labels": [
      "Unit Tests",
      "Integration",
      "E2E"
    ],
    "viz_labels_zh": [
      "单元测试",
      "集成测试",
      "端到端"
    ],
    "related": [
      "testing-trophy",
      "tdd",
      "property-based-testing"
    ],
    "tags": [
      "testing",
      "test-pyramid",
      "unit-tests",
      "integration-tests",
      "e2e"
    ],
    "origin_author": "Mike Cohn, 2009",
    "origin_source": "Succeeding with Agile: Software Development Using Scrum",
    "origin_source_zh": "《Scrum敏捷软件开发》",
    "complexity": "beginner",
    "when_to_use": [
      "Designing a test strategy for a new project or service from scratch",
      "Auditing an existing test suite that has become slow or flaky",
      "Onboarding a team to testing best practices with a simple mental model",
      "Balancing CI pipeline speed against test coverage confidence"
    ],
    "when_to_use_zh": [
      "为新项目或服务从零开始设计测试策略",
      "审计已变得缓慢或不稳定的现有测试套件",
      "用简单的心智模型引导团队学习测试最佳实践",
      "在CI流水线速度和测试覆盖信心之间取得平衡"
    ],
    "core_concepts": [
      "Unit Tests: Fast, isolated tests of individual functions or classes with no external dependencies",
      "Integration Tests: Tests verifying that multiple components, services, or databases work correctly together",
      "End-to-End Tests: Full-stack tests simulating real user journeys through the entire system",
      "Pyramid Shape: More unit tests at the base, fewer integration in the middle, fewest E2E at the top",
      "Cost-Speed Tradeoff: Higher-level tests provide more confidence but are slower, more expensive, and more brittle"
    ],
    "core_concepts_zh": [
      "单元测试：对单个函数或类进行快速、隔离的测试，不依赖外部资源",
      "集成测试：验证多个组件、服务或数据库正确协作的测试",
      "端到端测试：模拟真实用户旅程、贯穿整个系统的全栈测试",
      "金字塔形状：底部大量单元测试，中间较少集成测试，顶部最少端到端测试",
      "成本-速度权衡：层级越高的测试提供更高信心，但更慢、更贵且更脆弱"
    ],
    "timeline": [
      [
        "2003",
        "Mike Cohn begins advocating layered test strategies in agile teams"
      ],
      [
        "2009",
        "Cohn publishes the Test Pyramid in 'Succeeding with Agile'"
      ],
      [
        "2012",
        "Martin Fowler popularizes the concept with his influential blog post 'TestPyramid'"
      ],
      [
        "2018",
        "Ham Vocke writes the comprehensive 'Practical Test Pyramid' guide on martinfowler.com"
      ],
      [
        "2020",
        "Test Pyramid becomes the de facto mental model taught in most testing courses"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Mike Cohn开始在敏捷团队中倡导分层测试策略"
      ],
      [
        "2009",
        "Cohn在《Scrum敏捷软件开发》中发表测试金字塔"
      ],
      [
        "2012",
        "Martin Fowler通过其影响力博客文章'TestPyramid'普及了这一概念"
      ],
      [
        "2018",
        "Ham Vocke在martinfowler.com上撰写了全面的「实用测试金字塔」指南"
      ],
      [
        "2020",
        "测试金字塔成为大多数测试课程中教授的事实标准心智模型"
      ]
    ],
    "dos": [
      "Do keep unit tests fast (under 10ms each) because slow unit tests discourage frequent runs",
      "Do isolate layers with test doubles so each level validates its own concerns independently",
      "Do track the ratio of tests per layer because an inverted pyramid signals structural problems",
      "Do invest in deterministic tests because flaky tests erode team confidence in the entire suite"
    ],
    "dos_zh": [
      "保持单元测试的快速执行（每个不超过10毫秒），因为慢速单元测试会阻碍频繁运行",
      "用测试替身隔离各层，使每个层级独立验证自身关注点",
      "追踪每层的测试数量比例，因为倒金字塔意味着结构性问题",
      "投入精力确保测试的确定性，因为不稳定测试会侵蚀团队对整个套件的信心"
    ],
    "donts": [
      "Don't write E2E tests for every feature because they are slow, brittle, and expensive to maintain",
      "Don't mock everything in integration tests because over-mocking hides real integration bugs",
      "Don't ignore flaky tests because they train teams to dismiss legitimate failures",
      "Don't treat the pyramid as a rigid rule because some architectures need a different shape"
    ],
    "donts_zh": [
      "不要为每个功能都编写端到端测试，因为它们缓慢、脆弱且维护成本高",
      "不要在集成测试中过度使用Mock，因为过度Mock会隐藏真实的集成缺陷",
      "不要忽视不稳定测试，因为它们会训练团队习惯性地忽略合理的失败",
      "不要将金字塔视为死板规则，因为某些架构需要不同的形状"
    ],
    "case_study_company": "Google",
    "case_study": "Google's engineering teams maintain a roughly 70/20/10 split between unit, integration, and E2E tests across their monorepo. This ratio keeps their CI pipeline fast while still catching integration issues. When teams deviated toward too many E2E tests, build times spiked and flaky test rates increased, prompting internal tooling to enforce pyramid ratios.",
    "case_study_zh": "Google的工程团队在其单一代码仓库中大致维持着70/20/10的单元测试、集成测试和端到端测试比例。这一比例保证了CI流水线的速度，同时仍能捕获集成问题。当团队偏向过多的端到端测试时，构建时间激增、不稳定测试率上升，促使内部工具开始强制执行金字塔比例。",
    "when_not_to_use": [
      "Highly UI-driven applications where visual correctness matters more than logic coverage",
      "Simple CRUD apps with minimal business logic where integration tests provide more value",
      "Exploratory prototyping phases where tests would be immediately rewritten"
    ],
    "when_not_to_use_zh": [
      "以UI为核心的应用，视觉正确性比逻辑覆盖更重要",
      "业务逻辑极少的简单CRUD应用，集成测试能提供更多价值",
      "探索性原型阶段，测试很快就会被重写"
    ],
    "adopters": [
      "Google",
      "Microsoft",
      "Spotify",
      "Atlassian",
      "ThoughtWorks"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "testability",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Mike Cohn (2009). \"Succeeding with Agile: Software Development Using Scrum\". Addison-Wesley.",
    "secondary_sources": [
      "Martin Fowler (2012). \"TestPyramid\". martinfowler.com.",
      "Ham Vocke (2018). \"The Practical Test Pyramid\". martinfowler.com."
    ],
    "typed_relations": [
      {
        "slug": "testing-trophy",
        "type": "alternative"
      },
      {
        "slug": "tdd",
        "type": "complement"
      },
      {
        "slug": "property-based-testing",
        "type": "complement"
      }
    ]
  },
  {
    "id": 45,
    "name": "Testing Trophy",
    "name_zh": "测试奖杯模型",
    "slug": "testing-trophy",
    "category": "quality",
    "desc": "Emphasize integration tests as the highest-ROI testing layer",
    "desc_zh": "强调集成测试作为投资回报率最高的测试层级",
    "steps": [
      "Start with static analysis: use TypeScript, ESLint, and formatters as the wide base of the trophy to catch errors before runtime",
      "Invest heavily in integration tests: test real component interactions, API calls, and database queries — the trophy's widest bulge",
      "Write focused unit tests: cover complex algorithms and pure utility functions, but don't unit-test simple wiring or framework glue",
      "Add a small cap of E2E tests: cover only the most critical happy paths and smoke tests with tools like Playwright or Cypress",
      "Optimize for confidence per test dollar: measure defect escape rate per layer and shift investment toward the layer catching the most bugs"
    ],
    "steps_zh": [
      "从静态分析开始：使用TypeScript、ESLint和格式化工具作为奖杯的宽基座，在运行前捕获错误",
      "大量投入集成测试：测试真实的组件交互、API调用和数据库查询——奖杯最宽的部分",
      "编写聚焦的单元测试：覆盖复杂算法和纯工具函数，但不对简单装配或框架胶水做单元测试",
      "添加少量端到端测试：仅用Playwright或Cypress覆盖最关键的正向路径和冒烟测试",
      "优化每测试美元的置信度：衡量每层的缺陷逃逸率，将投入转向捕获最多bug的层级"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Static",
      "Unit",
      "Integration",
      "E2E"
    ],
    "viz_labels_zh": [
      "静态分析",
      "单元测试",
      "集成测试",
      "端到端"
    ],
    "related": [
      "test-pyramid",
      "tdd",
      "contract-testing"
    ],
    "tags": [
      "testing",
      "integration-tests",
      "static-analysis",
      "test-strategy"
    ],
    "origin_author": "Kent C. Dodds, 2018",
    "origin_source": "The Testing Trophy and Testing Classifications (blog post)",
    "origin_source_zh": "《测试奖杯与测试分类》（博客文章）",
    "complexity": "intermediate",
    "when_to_use": [
      "Building modern JavaScript/TypeScript web applications with component-based frameworks",
      "Teams already using static typing and linting who want to optimize their test investment",
      "Projects where integration bugs cause more production incidents than unit-level logic errors",
      "Frontend-heavy applications with significant UI interaction logic"
    ],
    "when_to_use_zh": [
      "使用基于组件的框架构建现代JavaScript/TypeScript Web应用",
      "已使用静态类型和代码检查、希望优化测试投入的团队",
      "集成缺陷比单元级逻辑错误导致更多生产事故的项目",
      "有大量UI交互逻辑的前端密集型应用"
    ],
    "core_concepts": [
      "Static Analysis: Type checkers and linters as the zero-cost base layer catching bugs at compile time",
      "Integration Tests: The highest-ROI layer testing real interactions between components and services",
      "Confidence Coefficient: The idea that each test should maximize confidence per dollar of maintenance cost",
      "Testing Classification: Distinguishing tests by the type of confidence they provide, not just scope",
      "Write Tests Not Too Many Mostly Integration: Kent C. Dodds's guiding principle summarizing the approach"
    ],
    "core_concepts_zh": [
      "静态分析：类型检查器和代码检查工具作为零成本基础层，在编译时捕获缺陷",
      "集成测试：投资回报率最高的测试层，测试组件和服务之间的真实交互",
      "信心系数：每个测试应最大化每单位维护成本所带来的信心",
      "测试分类：按测试提供的信心类型区分测试，而不仅仅是范围",
      "编写测试，不要太多，主要写集成测试：Kent C. Dodds总结该方法的指导原则"
    ],
    "timeline": [
      [
        "2017",
        "Kent C. Dodds develops the Testing Library family (DOM Testing Library, React Testing Library)"
      ],
      [
        "2018",
        "Dodds publishes 'The Testing Trophy and Testing Classifications' blog post coining the term"
      ],
      [
        "2019",
        "React Testing Library overtakes Enzyme as the most popular React testing tool"
      ],
      [
        "2020",
        "Testing Trophy gains widespread adoption in the frontend community"
      ],
      [
        "2023",
        "The philosophy influences testing approaches across Vue, Svelte, and other framework ecosystems"
      ]
    ],
    "timeline_zh": [
      [
        "2017",
        "Kent C. Dodds开发了Testing Library系列工具（DOM Testing Library、React Testing Library）"
      ],
      [
        "2018",
        "Dodds发表「测试奖杯与测试分类」博客文章，首创该术语"
      ],
      [
        "2019",
        "React Testing Library超越Enzyme成为最流行的React测试工具"
      ],
      [
        "2020",
        "测试奖杯模型在前端社区获得广泛采用"
      ],
      [
        "2023",
        "该理念影响了Vue、Svelte等框架生态系统的测试方法"
      ]
    ],
    "dos": [
      "Do test components the way users interact with them because implementation-detail tests break on refactors",
      "Do invest in static analysis tooling because it catches entire categories of bugs for free",
      "Do write integration tests that span real component boundaries because that is where most bugs live",
      "Do use Testing Library's guiding principles to write tests that resemble how software is used"
    ],
    "dos_zh": [
      "按用户交互方式测试组件，因为测试实现细节会在重构时频繁失败",
      "投资静态分析工具，因为它能免费捕获整类缺陷",
      "编写跨越真实组件边界的集成测试，因为大多数缺陷存在于边界处",
      "使用Testing Library的指导原则编写模拟真实软件使用方式的测试"
    ],
    "donts": [
      "Don't write excessive unit tests for simple component wiring because they add maintenance cost without proportional confidence",
      "Don't test implementation details because such tests break on every refactor without catching real bugs",
      "Don't skip static analysis because it is the cheapest and fastest bug-catching layer available",
      "Don't rely solely on E2E tests because they are too slow and flaky for broad coverage"
    ],
    "donts_zh": [
      "不要为简单的组件组装编写过多单元测试，因为增加了维护成本却没有相应的信心提升",
      "不要测试实现细节，因为这类测试在每次重构时都会失败，却捕获不到真实缺陷",
      "不要跳过静态分析，因为它是最廉价、最快速的缺陷捕获层",
      "不要仅依赖端到端测试，因为它们太慢且不稳定，无法提供广泛覆盖"
    ],
    "case_study_company": "PayPal",
    "case_study": "At PayPal, Kent C. Dodds championed the Testing Trophy approach during his tenure, shifting the frontend team from heavy Enzyme unit tests to React Testing Library integration tests. This reduced test maintenance burden by roughly 40% while improving defect detection rates, as integration tests caught more cross-component bugs that unit tests had missed.",
    "case_study_zh": "在PayPal任职期间，Kent C. Dodds推广了测试奖杯方法，将前端团队从大量Enzyme单元测试转向React Testing Library集成测试。这将测试维护负担降低了约40%，同时提高了缺陷检测率，因为集成测试捕获了更多单元测试遗漏的跨组件缺陷。",
    "when_not_to_use": [
      "Backend systems with complex algorithmic logic where unit tests are more valuable than integration tests",
      "Projects without static typing where the base layer of the trophy is effectively missing",
      "Systems with expensive integration test environments where unit tests are more cost-effective"
    ],
    "when_not_to_use_zh": [
      "拥有复杂算法逻辑的后端系统，单元测试比集成测试更有价值",
      "没有静态类型的项目，奖杯的基础层实际上是缺失的",
      "集成测试环境成本高昂的系统，单元测试更具成本效益"
    ],
    "adopters": [
      "PayPal",
      "Shopify",
      "Vercel",
      "Netflix",
      "GitHub"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "testability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Kent C. Dodds (2018). \"The Testing Trophy and Testing Classifications\". kentcdodds.com.",
    "secondary_sources": [
      "Kent C. Dodds (2019). \"Write Tests. Not Too Many. Mostly Integration.\". kentcdodds.com.",
      "Guillermo Rauch (2016). \"Write Tests\" (original tweet inspiring the concept). Twitter."
    ],
    "typed_relations": [
      {
        "slug": "test-pyramid",
        "type": "alternative"
      },
      {
        "slug": "tdd",
        "type": "complement"
      },
      {
        "slug": "contract-testing",
        "type": "complement"
      }
    ]
  },
  {
    "id": 46,
    "name": "Test-Driven Development (TDD)",
    "name_zh": "测试驱动开发",
    "slug": "tdd",
    "category": "quality",
    "desc": "Write failing tests first, then code to pass, then refactor",
    "desc_zh": "先写失败测试，再编写代码使之通过，最后重构",
    "steps": [
      "Red: write a small, focused test that describes the next behavior increment; run it and confirm it fails for the right reason",
      "Green: write the minimum amount of production code needed to make the failing test pass — no more, no less",
      "Refactor: improve the code's structure, naming, and duplication while keeping all tests green; apply clean code principles",
      "Repeat the cycle: pick the next simplest behavior, write a failing test, make it pass, and refactor — in cycles of 1-5 minutes",
      "Build a regression suite: as TDD cycles accumulate, the test suite becomes a living specification and safety net for future changes"
    ],
    "steps_zh": [
      "红灯：编写一个小而专注的测试描述下一个行为增量；运行并确认它因正确的原因失败",
      "绿灯：编写刚好使失败测试通过所需的最少生产代码——不多不少",
      "重构：在保持所有测试通过的前提下改善代码结构、命名和重复；应用整洁代码原则",
      "重复循环：选择下一个最简单的行为，编写失败测试，使其通过并重构——以1-5分钟为周期",
      "构建回归套件：随TDD周期累积，测试套件成为活的规格说明和未来变更的安全网"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Red",
      "Green",
      "Refactor"
    ],
    "viz_labels_zh": [
      "红灯",
      "绿灯",
      "重构"
    ],
    "related": [
      "test-pyramid",
      "bdd",
      "clean-code-principles"
    ],
    "tags": [
      "tdd",
      "red-green-refactor",
      "test-first",
      "design",
      "feedback"
    ],
    "origin_author": "Kent Beck, 1999",
    "origin_source": "Extreme Programming Explained: Embrace Change",
    "origin_source_zh": "《解析极限编程：拥抱变化》",
    "complexity": "intermediate",
    "when_to_use": [
      "Implementing well-understood business logic where requirements can be expressed as clear assertions",
      "Refactoring legacy code where you need a safety net of tests before making changes",
      "Developing library or API code where correctness and contracts are paramount",
      "Coaching developers on incremental design and YAGNI thinking"
    ],
    "when_to_use_zh": [
      "实现需求明确的业务逻辑，能用清晰的断言表达",
      "重构遗留代码，在修改前需要测试安全网",
      "开发正确性和契约至关重要的库或API代码",
      "指导开发者学习增量设计和YAGNI思维"
    ],
    "core_concepts": [
      "Red-Green-Refactor: The three-step cycle of writing a failing test, making it pass, then improving the code",
      "Baby Steps: Making the smallest possible increment in each cycle to maintain continuous progress",
      "Emergent Design: Letting the architecture emerge from the refactoring step rather than up-front big design",
      "YAGNI: Only writing code needed to pass the current test, avoiding speculative features",
      "Triangulation: Using multiple test cases to drive toward a general solution instead of hard-coding"
    ],
    "core_concepts_zh": [
      "红-绿-重构：编写失败测试、使其通过、改善代码的三步循环",
      "小步前进：每个循环中做最小可能的增量，保持持续进展",
      "浮现式设计：让架构从重构步骤中自然浮现，而非前期大规模设计",
      "YAGNI：只编写使当前测试通过所需的代码，避免推测性功能",
      "三角测量：使用多个测试用例驱动通用解决方案，而非硬编码"
    ],
    "timeline": [
      [
        "1999",
        "Kent Beck introduces TDD as a core practice of Extreme Programming"
      ],
      [
        "2002",
        "Beck publishes 'Test-Driven Development: By Example' as a standalone guide"
      ],
      [
        "2005",
        "TDD becomes a mainstream practice adopted by agile teams worldwide"
      ],
      [
        "2014",
        "David Heinemeier Hansson sparks the 'TDD is Dead' debate, leading to deeper community discussion"
      ],
      [
        "2020",
        "TDD experiences resurgence with AI-assisted coding, as tests serve as specifications for code generation"
      ]
    ],
    "timeline_zh": [
      [
        "1999",
        "Kent Beck将TDD作为极限编程的核心实践引入"
      ],
      [
        "2002",
        "Beck出版《测试驱动开发：实战与模式解析》作为独立指南"
      ],
      [
        "2005",
        "TDD成为全球敏捷团队广泛采用的主流实践"
      ],
      [
        "2014",
        "David Heinemeier Hansson发起'TDD已死'辩论，引发社区深入讨论"
      ],
      [
        "2020",
        "随着AI辅助编码兴起，TDD迎来复兴，测试充当代码生成的规格说明"
      ]
    ],
    "dos": [
      "Do write the test before the production code because the test-first discipline drives better design",
      "Do keep cycles short (1-5 minutes) because long cycles lose the feedback benefit of TDD",
      "Do refactor aggressively in the green phase because skipping refactoring leads to test-passing spaghetti",
      "Do test behavior not implementation because behavior tests survive refactoring"
    ],
    "dos_zh": [
      "在生产代码之前编写测试，因为测试优先的纪律能驱动更好的设计",
      "保持短周期（1-5分钟），因为过长的周期会失去TDD的反馈优势",
      "在绿灯阶段积极重构，因为跳过重构会导致虽然通过测试但代码混乱",
      "测试行为而非实现，因为行为测试能在重构中存活"
    ],
    "donts": [
      "Don't write multiple failing tests at once because it breaks the one-step-at-a-time discipline",
      "Don't skip the refactor step because accumulated technical debt defeats the purpose of TDD",
      "Don't test private methods directly because it couples tests to implementation details",
      "Don't pursue 100% coverage as a goal because it leads to low-value tests and false confidence"
    ],
    "donts_zh": [
      "不要同时编写多个失败测试，因为这破坏了一步一步前进的纪律",
      "不要跳过重构步骤，因为累积的技术债会违背TDD的初衷",
      "不要直接测试私有方法，因为这将测试与实现细节耦合",
      "不要把100%覆盖率作为目标，因为这会导致低价值测试和虚假信心"
    ],
    "case_study_company": "Pivotal Labs",
    "case_study": "Pivotal Labs (now part of VMware Tanzu) built their entire consulting practice around TDD and pair programming. Their client engagements consistently showed that TDD-practiced teams delivered code with 40-60% fewer production defects compared to non-TDD teams. The discipline also reduced debugging time significantly, as most bugs were caught within minutes of introduction.",
    "case_study_zh": "Pivotal Labs（现为VMware Tanzu的一部分）围绕TDD和结对编程建立了整个咨询实践。他们的客户项目一致表明，采用TDD的团队交付的代码比未采用TDD的团队减少了40-60%的生产缺陷。该方法还显著减少了调试时间，因为大多数缺陷在引入后几分钟内就被捕获。",
    "when_not_to_use": [
      "Rapid prototyping where requirements are unknown and code will be thrown away",
      "Exploratory UI design where visual outcomes are subjective and hard to assert",
      "Highly concurrent or distributed systems where deterministic unit testing is difficult"
    ],
    "when_not_to_use_zh": [
      "需求不明确且代码将被丢弃的快速原型开发",
      "视觉效果主观且难以断言的探索性UI设计",
      "确定性单元测试困难的高并发或分布式系统"
    ],
    "adopters": [
      "Pivotal Labs",
      "ThoughtWorks",
      "8th Light",
      "Industrial Logic",
      "Spotify"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "testability",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Kent Beck (2002). \"Test-Driven Development: By Example\". Addison-Wesley.",
    "secondary_sources": [
      "Kent Beck (1999). \"Extreme Programming Explained: Embrace Change\". Addison-Wesley.",
      "Robert C. Martin (2007). \"Professionalism and Test-Driven Development\". IEEE Software, 24(3)."
    ],
    "typed_relations": [
      {
        "slug": "test-pyramid",
        "type": "complement"
      },
      {
        "slug": "bdd",
        "type": "complement"
      },
      {
        "slug": "clean-code-principles",
        "type": "complement"
      }
    ]
  },
  {
    "id": 47,
    "name": "Behavior-Driven Development (BDD)",
    "name_zh": "行为驱动开发",
    "slug": "bdd",
    "category": "quality",
    "desc": "Specify behavior in Given-When-Then shared by all stakeholders",
    "desc_zh": "用 Given-When-Then 格式描述行为，所有干系人共享理解",
    "steps": [
      "Discovery workshop: collaborate with product, QA, and dev to discover scenarios using Example Mapping (rules, examples, questions)",
      "Write scenarios in Gherkin: express each behavior as Given [context], When [action], Then [outcome] in a .feature file",
      "Automate step definitions: implement the Given/When/Then steps in code that drives the system under test (Cucumber, SpecFlow, Behave)",
      "Run as living documentation: execute BDD scenarios in CI and publish results as human-readable reports that serve as up-to-date specs",
      "Refine continuously: review scenarios in sprint planning, prune obsolete ones, and add new scenarios for newly discovered edge cases"
    ],
    "steps_zh": [
      "发现工作坊：与产品、QA和开发协作，使用示例映射（规则、示例、问题）发现场景",
      "用Gherkin编写场景：将每个行为表达为Given [上下文]、When [动作]、Then [结果]，保存在.feature文件中",
      "自动化步骤定义：用代码实现Given/When/Then步骤以驱动被测系统（Cucumber、SpecFlow、Behave）",
      "作为活文档运行：在CI中执行BDD场景并发布人类可读的报告，作为始终最新的规格说明",
      "持续精化：在迭代计划中评审场景，修剪过时场景，为新发现的边界情况添加新场景"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Given",
      "When",
      "Then"
    ],
    "viz_labels_zh": [
      "前置条件",
      "触发动作",
      "预期结果"
    ],
    "related": [
      "tdd",
      "test-pyramid",
      "domain-driven-design"
    ],
    "tags": [
      "bdd",
      "gherkin",
      "given-when-then",
      "living-documentation",
      "collaboration"
    ],
    "origin_author": "Dan North, 2006",
    "origin_source": "Introducing BDD (blog post on dannorth.net)",
    "origin_source_zh": "《BDD介绍》（dannorth.net博客文章）",
    "complexity": "intermediate",
    "when_to_use": [
      "Projects where miscommunication between business and development causes frequent rework",
      "Domain-heavy applications where business rules need executable documentation",
      "Teams with non-technical stakeholders who need to read and validate test specifications",
      "Regulated industries requiring traceable specification-to-test coverage"
    ],
    "when_to_use_zh": [
      "业务与开发之间的沟通不畅导致频繁返工的项目",
      "业务规则需要可执行文档的领域密集型应用",
      "有非技术干系人需要阅读和验证测试规格的团队",
      "需要可追溯的规格到测试覆盖的受监管行业"
    ],
    "core_concepts": [
      "Given-When-Then: A structured format for expressing scenarios as context, action, and expected outcome",
      "Gherkin: A domain-specific language for writing human-readable executable specifications",
      "Example Mapping: A collaborative workshop technique to discover rules, examples, and open questions",
      "Living Documentation: Automated scenarios that serve as always-up-to-date system documentation",
      "Ubiquitous Language: Using the same domain terms in scenarios, code, and conversations with stakeholders"
    ],
    "core_concepts_zh": [
      "Given-When-Then：用上下文、动作和预期结果表达场景的结构化格式",
      "Gherkin：用于编写人类可读的可执行规格的领域特定语言",
      "示例映射：发现规则、示例和开放问题的协作工作坊技术",
      "活文档：作为始终最新系统文档的自动化场景",
      "通用语言：在场景、代码和干系人对话中使用相同的领域术语"
    ],
    "timeline": [
      [
        "2003",
        "Dan North begins exploring alternatives to TDD terminology to improve communication"
      ],
      [
        "2006",
        "North publishes 'Introducing BDD' establishing the Given-When-Then format"
      ],
      [
        "2008",
        "Cucumber is released by Aslak Hellesoy, making BDD automation practical in Ruby"
      ],
      [
        "2012",
        "SpecFlow brings BDD to .NET; Behave brings it to Python; BDD goes cross-platform"
      ],
      [
        "2015",
        "Example Mapping technique by Matt Wynne formalizes the BDD discovery workshop process"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Dan North开始探索TDD术语的替代方案以改善沟通"
      ],
      [
        "2006",
        "North发表「BDD介绍」，确立Given-When-Then格式"
      ],
      [
        "2008",
        "Aslak Hellesoy发布Cucumber，使BDD自动化在Ruby中变得实用"
      ],
      [
        "2012",
        "SpecFlow将BDD带入.NET；Behave将其带入Python；BDD走向跨平台"
      ],
      [
        "2015",
        "Matt Wynne的示例映射技术正式化了BDD发现工作坊流程"
      ]
    ],
    "dos": [
      "Do involve the whole team in writing scenarios because BDD is a collaboration tool, not just a testing tool",
      "Do keep scenarios at the business behavior level because technical details belong in step definitions",
      "Do use Example Mapping before writing Gherkin because it prevents writing scenarios for unclear requirements",
      "Do prune outdated scenarios regularly because stale scenarios undermine trust in living documentation"
    ],
    "dos_zh": [
      "让整个团队参与编写场景，因为BDD是协作工具而不仅是测试工具",
      "保持场景在业务行为层面，因为技术细节属于步骤定义",
      "在编写Gherkin之前使用示例映射，因为这能防止为不清晰的需求编写场景",
      "定期修剪过时场景，因为陈旧场景会破坏对活文档的信任"
    ],
    "donts": [
      "Don't write scenarios in isolation without business stakeholders because it defeats the collaboration purpose",
      "Don't include UI selectors or technical details in Gherkin because it makes scenarios brittle and unreadable",
      "Don't treat BDD as merely a test automation framework because the real value is in the discovery conversations",
      "Don't write too many scenarios per feature because it creates a maintenance burden that slows development"
    ],
    "donts_zh": [
      "不要在没有业务干系人参与的情况下孤立编写场景，因为这违背了协作的目的",
      "不要在Gherkin中包含UI选择器或技术细节，因为这使场景脆弱且难以阅读",
      "不要将BDD仅视为测试自动化框架，因为真正的价值在于发现对话",
      "不要为每个功能编写过多场景，因为这会产生拖慢开发的维护负担"
    ],
    "case_study_company": "GOV.UK",
    "case_study": "The UK Government Digital Service (GDS) adopted BDD with Cucumber for building GOV.UK services. The Given-When-Then scenarios served as a shared language between policy teams and developers, ensuring that government services correctly implemented complex regulatory requirements. This approach reduced requirement misunderstandings by over 50% in their first year of adoption.",
    "case_study_zh": "英国政府数字服务部（GDS）在构建GOV.UK服务时采用了BDD与Cucumber。Given-When-Then场景作为政策团队和开发人员之间的共享语言，确保政府服务正确实现了复杂的监管要求。这一方法在采用的第一年将需求误解减少了50%以上。",
    "when_not_to_use": [
      "Small teams where developers and stakeholders communicate directly and frequently",
      "Purely technical infrastructure projects without business-facing behavior",
      "Rapid prototyping phases where specifications change too fast for Gherkin to keep up"
    ],
    "when_not_to_use_zh": [
      "开发人员与干系人直接频繁沟通的小型团队",
      "没有面向业务行为的纯技术基础设施项目",
      "规格变化太快以至于Gherkin跟不上的快速原型阶段"
    ],
    "adopters": [
      "GOV.UK",
      "BBC",
      "Booking.com",
      "Ryanair",
      "Commonwealth Bank of Australia"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "testability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Dan North (2006). \"Introducing BDD\". dannorth.net.",
    "secondary_sources": [
      "Matt Wynne and Aslak Hellesoy (2012). \"The Cucumber Book: Behaviour-Driven Development for Testers and Developers\". Pragmatic Bookshelf.",
      "Gojko Adzic (2011). \"Specification by Example: How Successful Teams Deliver the Right Software\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "tdd",
        "type": "complement"
      },
      {
        "slug": "test-pyramid",
        "type": "complement"
      },
      {
        "slug": "domain-driven-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 48,
    "name": "Property-Based Testing",
    "name_zh": "基于属性的测试",
    "slug": "property-based-testing",
    "category": "quality",
    "desc": "Test invariant properties with auto-generated random inputs",
    "desc_zh": "使用自动生成的随机输入测试不变性质",
    "steps": [
      "Identify properties: define universal truths about your code (e.g., encode then decode returns original, sort output is always ordered)",
      "Write generators: create or compose data generators that produce valid random inputs covering edge cases (empty, huge, special chars)",
      "Express the property: write a test that asserts the property holds for all generated inputs using a PBT library (fast-check, Hypothesis, QuickCheck)",
      "Analyze shrunk failures: when a property fails, the framework automatically shrinks the input to the minimal failing case for easy debugging",
      "Combine with example tests: use PBT for invariants and boundary exploration; keep example-based tests for specific documented behaviors"
    ],
    "steps_zh": [
      "识别属性：定义代码的普遍真理（如编码后解码返回原始值，排序输出总是有序的）",
      "编写生成器：创建或组合数据生成器，产生覆盖边界情况的有效随机输入（空值、极大值、特殊字符）",
      "表达属性：使用PBT库（fast-check、Hypothesis、QuickCheck）编写断言属性对所有生成输入成立的测试",
      "分析收缩的失败用例：属性失败时，框架自动将输入收缩至最小失败用例以便调试",
      "与示例测试结合：对不变式和边界探索使用PBT；对特定文档化行为保留基于示例的测试"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Properties",
      "Generators",
      "Shrinking",
      "Coverage",
      "Invariants"
    ],
    "viz_labels_zh": [
      "属性定义",
      "生成器",
      "收缩",
      "覆盖率",
      "不变量"
    ],
    "related": [
      "tdd",
      "design-by-contract",
      "functional-core-imperative-shell"
    ],
    "tags": [
      "property-testing",
      "fuzzing",
      "generators",
      "invariants",
      "quickcheck"
    ],
    "origin_author": "Koen Claessen & John Hughes, 2000",
    "origin_source": "QuickCheck: A Lightweight Tool for Random Testing of Haskell Programs (ICFP 2000)",
    "origin_source_zh": "《QuickCheck：一种轻量级的Haskell程序随机测试工具》（ICFP 2000）",
    "complexity": "advanced",
    "when_to_use": [
      "Testing serialization/deserialization round-trips where encode-decode must be lossless",
      "Validating mathematical or algorithmic invariants (sorting, parsing, compression)",
      "Exploring edge cases that manual test writers would never think of",
      "Testing data pipeline transformations where properties like idempotence or commutativity must hold"
    ],
    "when_to_use_zh": [
      "测试序列化/反序列化往返，编码-解码必须无损",
      "验证数学或算法不变式（排序、解析、压缩）",
      "探索手工测试编写者永远不会想到的边界情况",
      "测试数据管道转换，其中幂等性或交换律等属性必须成立"
    ],
    "core_concepts": [
      "Property: A universally quantified statement that must hold true for all valid inputs",
      "Generator: A composable function that produces random values of a given type for test inputs",
      "Shrinking: Automatic reduction of a failing input to the minimal case that still fails",
      "Arbitrary: A typeclass or interface that combines a generator with a shrinker for a type",
      "Counterexample: A specific input found by the framework that violates the stated property"
    ],
    "core_concepts_zh": [
      "属性：对所有有效输入必须成立的全称量化陈述",
      "生成器：产生给定类型随机值作为测试输入的可组合函数",
      "收缩：将失败输入自动缩减为仍然失败的最小用例",
      "Arbitrary：将生成器和收缩器组合的类型类或接口",
      "反例：框架找到的违反所述属性的具体输入"
    ],
    "timeline": [
      [
        "1999",
        "Koen Claessen and John Hughes develop QuickCheck at Chalmers University"
      ],
      [
        "2000",
        "QuickCheck paper published at ICFP, introducing property-based testing to the functional programming community"
      ],
      [
        "2006",
        "Quviq founded by John Hughes to commercialize QuickCheck for Erlang in industrial settings"
      ],
      [
        "2016",
        "Hypothesis for Python by David MacIver brings PBT to mainstream imperative languages"
      ],
      [
        "2018",
        "fast-check for JavaScript/TypeScript gains popularity, making PBT accessible to web developers"
      ]
    ],
    "timeline_zh": [
      [
        "1999",
        "Koen Claessen和John Hughes在查尔姆斯理工大学开发QuickCheck"
      ],
      [
        "2000",
        "QuickCheck论文在ICFP发表，将基于属性的测试引入函数式编程社区"
      ],
      [
        "2006",
        "John Hughes创立Quviq公司，将QuickCheck商业化应用于Erlang工业场景"
      ],
      [
        "2016",
        "David MacIver开发的Python版Hypothesis将PBT带入主流命令式语言"
      ],
      [
        "2018",
        "JavaScript/TypeScript的fast-check库流行，使PBT对Web开发者可及"
      ]
    ],
    "dos": [
      "Do start with simple round-trip properties because they are easy to understand and highly effective",
      "Do compose generators from smaller ones because complex custom generators are error-prone",
      "Do use shrinking to find minimal failing cases because it dramatically speeds up debugging",
      "Do combine PBT with example-based tests because each approach catches different kinds of bugs"
    ],
    "dos_zh": [
      "从简单的往返属性开始，因为它们易于理解且非常有效",
      "从较小的生成器组合构建复杂生成器，因为复杂的自定义生成器容易出错",
      "利用收缩找到最小失败用例，因为这能显著加速调试",
      "将PBT与基于示例的测试结合，因为两种方法捕获不同类型的缺陷"
    ],
    "donts": [
      "Don't write overly complex generators because they become a source of bugs themselves",
      "Don't ignore failing seeds because non-deterministic test failures indicate real property violations",
      "Don't replace all example tests with PBT because specific documented behaviors need explicit examples",
      "Don't test properties that are trivially true because they waste computation without providing confidence"
    ],
    "donts_zh": [
      "不要编写过于复杂的生成器，因为它们本身会成为缺陷来源",
      "不要忽略失败的种子值，因为非确定性测试失败表明存在真实的属性违反",
      "不要用PBT替换所有示例测试，因为特定文档化行为需要明确的示例",
      "不要测试平凡成立的属性，因为这浪费计算而不提供信心"
    ],
    "case_study_company": "Volvo",
    "case_study": "Volvo used Quviq QuickCheck to test the AUTOSAR embedded software standard used in their vehicles. Property-based testing discovered over 200 bugs in the specification and implementations that traditional testing had missed, including critical timing and state machine issues. John Hughes presented this work as evidence of PBT's industrial effectiveness at multiple conferences.",
    "case_study_zh": "沃尔沃使用Quviq QuickCheck测试其车辆中使用的AUTOSAR嵌入式软件标准。基于属性的测试发现了传统测试遗漏的200多个规格和实现缺陷，包括关键的时序和状态机问题。John Hughes在多个会议上展示这项工作，作为PBT工业有效性的证据。",
    "when_not_to_use": [
      "Simple CRUD operations where properties are trivial and example tests suffice",
      "UI layout testing where visual correctness cannot be expressed as algebraic properties",
      "Exploratory prototypes where the properties themselves are not yet understood"
    ],
    "when_not_to_use_zh": [
      "属性平凡且示例测试足够的简单CRUD操作",
      "视觉正确性无法用代数属性表达的UI布局测试",
      "属性本身尚未被理解的探索性原型"
    ],
    "adopters": [
      "Volvo",
      "Ericsson",
      "Spotify",
      "Jet.com",
      "Jane Street"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "testability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Koen Claessen and John Hughes (2000). \"QuickCheck: A Lightweight Tool for Random Testing of Haskell Programs\". ACM ICFP 2000.",
    "secondary_sources": [
      "John Hughes (2007). \"QuickCheck Testing for Fun and Profit\". PADL 2007.",
      "Fred Hebert (2019). \"Property-Based Testing with PropEr, Erlang, and Elixir\". Pragmatic Bookshelf."
    ],
    "typed_relations": [
      {
        "slug": "tdd",
        "type": "complement"
      },
      {
        "slug": "design-by-contract",
        "type": "complement"
      },
      {
        "slug": "functional-core-imperative-shell",
        "type": "complement"
      }
    ]
  },
  {
    "id": 49,
    "name": "Chaos Engineering",
    "name_zh": "混沌工程",
    "slug": "chaos-engineering",
    "category": "quality",
    "desc": "Inject controlled failures to build confidence in system resilience",
    "desc_zh": "注入受控故障以增强对系统韧性的信心",
    "steps": [
      "Define steady state: establish measurable indicators of normal system behavior (latency p99, error rate, throughput) as your baseline",
      "Hypothesize impact: predict what will happen when a specific failure is injected (e.g., 'losing one AZ won't increase error rate above 0.1%')",
      "Design the experiment: choose the failure mode (network partition, pod kill, CPU spike, dependency latency) and the blast radius",
      "Run in production: execute the experiment during business hours with automatic abort conditions if steady state degrades beyond thresholds",
      "Learn and harden: analyze results, fix discovered weaknesses, update runbooks, and repeat experiments to verify fixes"
    ],
    "steps_zh": [
      "定义稳态：建立正常系统行为的可衡量指标（p99延迟、错误率、吞吐量）作为基线",
      "假设影响：预测注入特定故障后的结果（如「丢失一个可用区不会使错误率超过0.1%」）",
      "设计实验：选择故障模式（网络分区、Pod终止、CPU峰值、依赖延迟）和影响范围",
      "在生产环境运行：在业务时间执行实验，设置稳态超出阈值时自动中止的条件",
      "学习与加固：分析结果，修复发现的弱点，更新运维手册，重复实验验证修复效果"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Hypothesize",
      "Experiment",
      "Observe",
      "Improve"
    ],
    "viz_labels_zh": [
      "建立假设",
      "注入故障",
      "观测结果",
      "持续改进"
    ],
    "related": [
      "circuit-breaker-pattern",
      "bulkhead-pattern",
      "sli-slo-sla"
    ],
    "tags": [
      "chaos-engineering",
      "resilience",
      "fault-injection",
      "reliability"
    ],
    "origin_author": "Netflix, 2011",
    "origin_source": "Netflix Tech Blog: The Netflix Simian Army",
    "origin_source_zh": "Netflix技术博客：《Netflix猴子军团》",
    "complexity": "advanced",
    "when_to_use": [
      "Distributed systems where failures are inevitable and resilience must be validated proactively",
      "After major architectural changes to verify that new fault-tolerance mechanisms actually work",
      "Before peak traffic events (Black Friday, product launches) to uncover hidden weaknesses",
      "Organizations transitioning from monolith to microservices where failure modes multiply"
    ],
    "when_to_use_zh": [
      "故障不可避免且需要主动验证韧性的分布式系统",
      "重大架构变更后，验证新容错机制是否真正有效",
      "流量高峰事件（黑色星期五、产品发布）前发现隐藏弱点",
      "从单体向微服务过渡、故障模式成倍增加的组织"
    ],
    "core_concepts": [
      "Steady State Hypothesis: A measurable definition of normal system behavior used as the experiment baseline",
      "Blast Radius: The scope of impact of a chaos experiment, starting small and gradually expanding",
      "Abort Conditions: Automated safety mechanisms that halt experiments when impact exceeds acceptable thresholds",
      "Game Days: Scheduled team exercises where chaos experiments are run with the whole team observing and responding",
      "Failure Injection: Deliberately introducing faults like latency, errors, or resource constraints into a running system"
    ],
    "core_concepts_zh": [
      "稳态假设：作为实验基线的正常系统行为可衡量定义",
      "影响范围：混沌实验的影响范围，从小处开始逐步扩大",
      "中止条件：当影响超过可接受阈值时自动停止实验的安全机制",
      "演练日：安排的团队演练，全队观察和响应混沌实验",
      "故障注入：在运行系统中故意引入延迟、错误或资源约束等故障"
    ],
    "timeline": [
      [
        "2010",
        "Netflix creates Chaos Monkey to randomly terminate EC2 instances in production"
      ],
      [
        "2011",
        "Netflix publishes the Simian Army blog post, formalizing chaos engineering practices"
      ],
      [
        "2014",
        "Netflix releases Chaos Monkey as open source and formalizes the discipline internally"
      ],
      [
        "2017",
        "Casey Rosenthal and Nora Jones publish 'Chaos Engineering' (O'Reilly), establishing it as a field"
      ],
      [
        "2019",
        "Gremlin, LitmusChaos, and Chaos Mesh emerge as commercial and open-source chaos platforms"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "Netflix创建Chaos Monkey在生产环境中随机终止EC2实例"
      ],
      [
        "2011",
        "Netflix发表猴子军团博客文章，正式化混沌工程实践"
      ],
      [
        "2014",
        "Netflix将Chaos Monkey开源并在内部正式化该学科"
      ],
      [
        "2017",
        "Casey Rosenthal和Nora Jones出版《混沌工程》（O'Reilly），将其确立为一个领域"
      ],
      [
        "2019",
        "Gremlin、LitmusChaos和Chaos Mesh作为商业和开源混沌平台涌现"
      ]
    ],
    "dos": [
      "Do start with small blast radius experiments because production chaos must be safe and controlled",
      "Do define abort conditions before every experiment because uncontrolled chaos is just an outage",
      "Do run experiments during business hours because that is when real failures happen and teams are available",
      "Do share findings widely because chaos experiments create organizational learning about system behavior"
    ],
    "dos_zh": [
      "从小影响范围的实验开始，因为生产混沌必须是安全和受控的",
      "在每次实验前定义中止条件，因为不受控的混沌只是一次故障",
      "在业务时间运行实验，因为那是真实故障发生且团队在场的时候",
      "广泛分享发现，因为混沌实验创造了关于系统行为的组织学习"
    ],
    "donts": [
      "Don't run chaos experiments without monitoring in place because you cannot measure what you cannot observe",
      "Don't start in production without first practicing in staging because premature production chaos risks real outages",
      "Don't chaos-test without stakeholder buy-in because surprise outages destroy trust in the practice",
      "Don't treat chaos engineering as one-time because systems change and new weaknesses emerge continuously"
    ],
    "donts_zh": [
      "不要在没有监控的情况下运行混沌实验，因为你无法衡量无法观察的东西",
      "不要在未先在预发布环境实践的情况下直接在生产环境实验，因为过早的生产混沌有真实宕机风险",
      "不要在没有干系人支持的情况下进行混沌测试，因为意外宕机会摧毁对该实践的信任",
      "不要将混沌工程视为一次性活动，因为系统不断变化且新弱点持续出现"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix pioneered chaos engineering with Chaos Monkey, which randomly terminated production EC2 instances to force engineers to build resilient services. During the 2011 AWS US-East outage that took down Reddit, Quora, and others, Netflix remained operational because their systems had been hardened through continuous chaos experiments. This event validated the approach and inspired industry-wide adoption.",
    "case_study_zh": "Netflix以Chaos Monkey开创了混沌工程，随机终止生产EC2实例以迫使工程师构建具有韧性的服务。在2011年AWS美东区域故障导致Reddit、Quora等公司宕机时，Netflix因系统经过持续混沌实验加固而保持运营。这一事件验证了该方法并激发了全行业的采用。",
    "when_not_to_use": [
      "Systems without adequate monitoring and observability to measure experiment impact",
      "Early-stage startups where uptime matters more than resilience validation",
      "Monolithic systems with no redundancy where any failure causes total outage"
    ],
    "when_not_to_use_zh": [
      "缺乏足够监控和可观测性来衡量实验影响的系统",
      "正常运行时间比韧性验证更重要的早期创业公司",
      "没有冗余且任何故障都会导致全面宕机的单体系统"
    ],
    "adopters": [
      "Netflix",
      "Amazon",
      "Google",
      "Microsoft",
      "Slack"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Netflix Technology Blog (2011). \"The Netflix Simian Army\". netflixtechblog.com.",
    "secondary_sources": [
      "Casey Rosenthal and Nora Jones (2020). \"Chaos Engineering: System Resiliency in Practice\". O'Reilly Media.",
      "Ali Basiri et al. (2016). \"Chaos Engineering\". IEEE Software, 33(3)."
    ],
    "typed_relations": [
      {
        "slug": "circuit-breaker-pattern",
        "type": "complement"
      },
      {
        "slug": "bulkhead-pattern",
        "type": "complement"
      },
      {
        "slug": "sli-slo-sla",
        "type": "complement"
      }
    ]
  },
  {
    "id": 50,
    "name": "Circuit Breaker Pattern",
    "name_zh": "熔断器模式",
    "slug": "circuit-breaker-pattern",
    "category": "quality",
    "desc": "Prevent cascading failures by short-circuiting failing calls",
    "desc_zh": "通过短路失败调用防止级联故障扩散",
    "steps": [
      "Wrap remote calls: place a circuit breaker around each external dependency call (HTTP, database, third-party API)",
      "Configure thresholds: set failure count or failure rate thresholds that trigger the circuit to open (e.g., 5 failures in 10 seconds)",
      "Implement the three states: Closed (normal), Open (fail-fast with fallback), Half-Open (allow a probe request to test recovery)",
      "Design fallback responses: return cached data, default values, or degraded functionality when the circuit is open",
      "Monitor and alert: track circuit state transitions, open duration, and fallback invocation rates in dashboards and alerting systems"
    ],
    "steps_zh": [
      "包裹远程调用：为每个外部依赖调用（HTTP、数据库、第三方API）放置熔断器",
      "配置阈值：设置触发熔断器打开的失败计数或失败率阈值（如10秒内5次失败）",
      "实现三种状态：关闭（正常）、打开（快速失败并降级）、半开（允许探测请求测试恢复）",
      "设计降级响应：在熔断器打开时返回缓存数据、默认值或降级功能",
      "监控与告警：在仪表盘和告警系统中追踪熔断器状态转换、打开持续时间和降级调用率"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Closed",
      "Open",
      "Half-Open"
    ],
    "viz_labels_zh": [
      "关闭态",
      "断开态",
      "半开态"
    ],
    "related": [
      "bulkhead-pattern",
      "chaos-engineering",
      "saga-pattern"
    ],
    "tags": [
      "circuit-breaker",
      "resilience",
      "fault-tolerance",
      "cascading-failure",
      "fallback"
    ],
    "origin_author": "Michael Nygard, 2007",
    "origin_source": "Release It! Design and Deploy Production-Ready Software",
    "origin_source_zh": "《发布！软件的设计与部署》",
    "complexity": "intermediate",
    "when_to_use": [
      "Microservice architectures where one slow or failing service can cascade to bring down the entire system",
      "Applications calling external third-party APIs with unpredictable reliability",
      "High-traffic systems where waiting on failing dependencies wastes thread pool resources",
      "Services with defined fallback behaviors (caches, defaults) that can serve users during degradation"
    ],
    "when_to_use_zh": [
      "一个慢速或故障服务可能级联导致整个系统崩溃的微服务架构",
      "调用可靠性不可预测的外部第三方API的应用",
      "等待故障依赖会浪费线程池资源的高流量系统",
      "具有可在降级期间服务用户的降级行为（缓存、默认值）的服务"
    ],
    "core_concepts": [
      "Closed State: Normal operation where requests pass through and failures are counted",
      "Open State: The circuit is tripped and all requests immediately fail-fast without calling the dependency",
      "Half-Open State: A limited number of probe requests are allowed to test whether the dependency has recovered",
      "Fallback: An alternative response strategy used when the circuit is open (cached data, defaults, degraded features)",
      "Failure Threshold: The number or rate of failures within a time window that triggers the circuit to open"
    ],
    "core_concepts_zh": [
      "关闭状态：请求正常通过、失败被计数的正常运行状态",
      "打开状态：熔断器跳闸，所有请求立即快速失败而不调用依赖",
      "半开状态：允许有限数量的探测请求测试依赖是否已恢复",
      "降级响应：熔断器打开时使用的替代响应策略（缓存数据、默认值、降级功能）",
      "失败阈值：在时间窗口内触发熔断器打开的失败次数或失败率"
    ],
    "timeline": [
      [
        "2007",
        "Michael Nygard describes the Circuit Breaker pattern in 'Release It!'"
      ],
      [
        "2012",
        "Netflix releases Hystrix, making circuit breakers a standard microservice pattern"
      ],
      [
        "2014",
        "Martin Fowler publishes a detailed circuit breaker blog post popularizing the pattern"
      ],
      [
        "2018",
        "Resilience4j emerges as the lightweight successor to Hystrix for Java applications"
      ],
      [
        "2020",
        "Circuit breakers become built-in features of service mesh platforms like Istio and Linkerd"
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "Michael Nygard在《发布！》中描述了熔断器模式"
      ],
      [
        "2012",
        "Netflix发布Hystrix，使熔断器成为标准微服务模式"
      ],
      [
        "2014",
        "Martin Fowler发表详细的熔断器博客文章，普及了该模式"
      ],
      [
        "2018",
        "Resilience4j作为Hystrix的轻量级继任者出现，用于Java应用"
      ],
      [
        "2020",
        "熔断器成为Istio和Linkerd等服务网格平台的内置功能"
      ]
    ],
    "dos": [
      "Do configure different thresholds per dependency because each downstream has different failure characteristics",
      "Do implement meaningful fallbacks because a circuit breaker without fallback just changes the error message",
      "Do log circuit state transitions because they are critical signals for operational awareness",
      "Do test circuit breaker behavior with chaos engineering because misconfigured breakers can mask problems"
    ],
    "dos_zh": [
      "为每个依赖配置不同阈值，因为每个下游服务有不同的故障特征",
      "实现有意义的降级响应，因为没有降级的熔断器只是改变了错误消息",
      "记录熔断器状态转换日志，因为它们是运维感知的关键信号",
      "用混沌工程测试熔断器行为，因为配置错误的熔断器可能掩盖问题"
    ],
    "donts": [
      "Don't use a single circuit breaker for all dependencies because one failing service will trip the breaker for all",
      "Don't set thresholds too sensitive because transient network blips will trigger unnecessary circuit opens",
      "Don't forget the half-open state because without it the circuit never recovers automatically",
      "Don't ignore circuit breaker metrics because a circuit that is always open indicates a deeper systemic problem"
    ],
    "donts_zh": [
      "不要为所有依赖使用单一熔断器，因为一个故障服务会触发所有服务的熔断",
      "不要设置过于敏感的阈值，因为瞬时网络抖动会触发不必要的熔断",
      "不要忘记半开状态，因为没有它熔断器永远无法自动恢复",
      "不要忽略熔断器指标，因为始终处于打开状态的熔断器表明存在更深层的系统问题"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix developed Hystrix as their circuit breaker library after experiencing cascading failures in their microservice architecture. When a single recommendation service slowed down, it consumed all available threads in calling services, eventually taking down the entire streaming experience. Hystrix circuit breakers resolved this by fail-fasting degraded calls and returning cached recommendations instead.",
    "case_study_zh": "Netflix在其微服务架构中经历级联故障后开发了Hystrix熔断器库。当单个推荐服务变慢时，它消耗了调用服务的所有可用线程，最终导致整个流媒体体验崩溃。Hystrix熔断器通过快速失败降级调用并返回缓存的推荐内容来解决这一问题。",
    "when_not_to_use": [
      "Monolithic applications with no remote dependencies where failures are handled in-process",
      "Fire-and-forget messaging where failures don't block the caller",
      "Batch processing systems where retries with backoff are more appropriate than fail-fast"
    ],
    "when_not_to_use_zh": [
      "没有远程依赖、故障在进程内处理的单体应用",
      "失败不会阻塞调用者的即发即忘消息系统",
      "使用退避重试比快速失败更合适的批处理系统"
    ],
    "adopters": [
      "Netflix",
      "Amazon",
      "Uber",
      "Alibaba",
      "Capital One"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Michael Nygard (2007). \"Release It! Design and Deploy Production-Ready Software\". Pragmatic Bookshelf.",
    "secondary_sources": [
      "Martin Fowler (2014). \"CircuitBreaker\". martinfowler.com.",
      "Michael Nygard (2018). \"Release It! Design and Deploy Production-Ready Software, 2nd Edition\". Pragmatic Bookshelf."
    ],
    "typed_relations": [
      {
        "slug": "bulkhead-pattern",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "complement"
      },
      {
        "slug": "saga-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 51,
    "name": "Bulkhead Pattern",
    "name_zh": "舱壁模式",
    "slug": "bulkhead-pattern",
    "category": "quality",
    "desc": "Isolate components so a failure in one doesn't sink the whole system",
    "desc_zh": "隔离组件使某一部分的故障不会导致整个系统崩溃",
    "steps": [
      "Identify failure domains: map all external dependencies and internal components that could fail independently",
      "Partition resources: allocate dedicated thread pools, connection pools, or compute instances per dependency or feature",
      "Set limits per partition: configure max concurrent requests, queue depths, and timeouts for each bulkhead compartment",
      "Implement rejection policies: define what happens when a bulkhead is full — reject immediately, queue with bounded wait, or shed load",
      "Monitor compartment health: track utilization, rejection rates, and queue depth per bulkhead to detect and right-size compartments"
    ],
    "steps_zh": [
      "识别故障域：映射所有可能独立失败的外部依赖和内部组件",
      "分区资源：为每个依赖或功能分配专用的线程池、连接池或计算实例",
      "为每个分区设限：配置每个舱壁隔间的最大并发请求数、队列深度和超时时间",
      "实现拒绝策略：定义舱壁满时的处理方式——立即拒绝、有界等待排队或负载丢弃",
      "监控隔间健康：追踪每个舱壁的利用率、拒绝率和队列深度以检测并调整隔间大小"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Service A",
      "Service B",
      "Service C",
      "Isolation"
    ],
    "viz_labels_zh": [
      "服务A",
      "服务B",
      "服务C",
      "隔离舱"
    ],
    "related": [
      "circuit-breaker-pattern",
      "chaos-engineering",
      "cell-based-architecture"
    ],
    "tags": [
      "bulkhead",
      "isolation",
      "resilience",
      "resource-partitioning"
    ],
    "origin_author": "Michael Nygard, 2007",
    "origin_source": "Release It! Design and Deploy Production-Ready Software",
    "origin_source_zh": "《发布！软件的设计与部署》",
    "complexity": "advanced",
    "when_to_use": [
      "Systems with multiple external dependencies where one slow dependency can exhaust shared resources",
      "Multi-tenant platforms where one tenant's traffic spike must not affect others",
      "Services with mixed workload priorities (critical vs. best-effort) requiring resource isolation",
      "Applications experiencing thread pool starvation during dependency outages"
    ],
    "when_to_use_zh": [
      "有多个外部依赖且一个慢依赖可能耗尽共享资源的系统",
      "一个租户的流量激增不能影响其他租户的多租户平台",
      "混合工作负载优先级（关键型与尽力型）需要资源隔离的服务",
      "在依赖中断期间经历线程池饥饿的应用"
    ],
    "core_concepts": [
      "Failure Domain Isolation: Preventing a failure in one component from consuming resources needed by others",
      "Thread Pool Bulkhead: Dedicating separate thread pools per dependency so one cannot starve another",
      "Semaphore Bulkhead: Using counting semaphores to limit concurrent access without dedicated thread pools",
      "Resource Partitioning: Allocating fixed resource budgets (connections, memory, CPU) per workload or tenant",
      "Graceful Degradation: Continuing to serve critical functions even when non-critical partitions are overwhelmed"
    ],
    "core_concepts_zh": [
      "故障域隔离：防止一个组件的故障消耗其他组件所需的资源",
      "线程池舱壁：为每个依赖分配专用线程池，使其无法饿死其他线程池",
      "信号量舱壁：使用计数信号量限制并发访问而无需专用线程池",
      "资源分区：为每个工作负载或租户分配固定资源预算（连接、内存、CPU）",
      "优雅降级：即使非关键分区过载也能继续服务关键功能"
    ],
    "timeline": [
      [
        "2007",
        "Michael Nygard introduces the Bulkhead pattern in 'Release It!' inspired by ship compartmentalization"
      ],
      [
        "2012",
        "Netflix Hystrix implements thread pool isolation as a core bulkhead mechanism"
      ],
      [
        "2016",
        "Kubernetes resource limits and requests enable infrastructure-level bulkheading"
      ],
      [
        "2018",
        "Resilience4j provides lightweight bulkhead implementations for Java microservices"
      ],
      [
        "2021",
        "AWS Cell-based Architecture formalizes bulkheading at the infrastructure level for hyperscale systems"
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "Michael Nygard在《发布！》中受船舶隔舱启发引入舱壁模式"
      ],
      [
        "2012",
        "Netflix Hystrix实现线程池隔离作为核心舱壁机制"
      ],
      [
        "2016",
        "Kubernetes的资源限制和请求实现了基础设施级的舱壁隔离"
      ],
      [
        "2018",
        "Resilience4j为Java微服务提供轻量级舱壁实现"
      ],
      [
        "2021",
        "AWS蜂窝架构在基础设施层面正式化了超大规模系统的舱壁隔离"
      ]
    ],
    "dos": [
      "Do size bulkheads based on measured traffic patterns because under-provisioned partitions cause unnecessary rejections",
      "Do combine bulkheads with circuit breakers because they complement each other for comprehensive fault tolerance",
      "Do implement per-tenant bulkheads in multi-tenant systems because noisy neighbors are a top reliability risk",
      "Do monitor bulkhead utilization because consistently full bulkheads indicate under-provisioning or dependency issues"
    ],
    "dos_zh": [
      "根据实际流量模式调整舱壁大小，因为配置不足的分区会导致不必要的拒绝",
      "将舱壁与熔断器结合使用，因为两者相互补充以实现全面容错",
      "在多租户系统中实现每租户舱壁，因为嘈杂邻居是首要可靠性风险",
      "监控舱壁利用率，因为持续满载的舱壁表明资源不足或依赖问题"
    ],
    "donts": [
      "Don't create too many fine-grained partitions because overhead and complexity outweigh isolation benefits",
      "Don't share thread pools across critical and non-critical paths because this defeats the isolation purpose",
      "Don't set bulkhead limits without load testing because guessed limits are usually wrong",
      "Don't forget to handle rejection gracefully because rejected requests should return meaningful errors, not stack traces"
    ],
    "donts_zh": [
      "不要创建过多细粒度的分区，因为开销和复杂性会超过隔离收益",
      "不要在关键路径和非关键路径之间共享线程池，因为这违背了隔离目的",
      "不要在没有负载测试的情况下设置舱壁限制，因为猜测的限制通常是错误的",
      "不要忘记优雅处理拒绝，因为被拒绝的请求应返回有意义的错误而非堆栈跟踪"
    ],
    "case_study_company": "Amazon",
    "case_study": "Amazon uses bulkhead patterns extensively through their Cell-based Architecture, where each cell is an independent, isolated deployment serving a subset of customers. During the 2021 US-EAST-1 disruption, cells with proper bulkheading continued operating while affected cells were isolated, preventing a total service outage and limiting the blast radius of the failure.",
    "case_study_zh": "Amazon通过其蜂窝架构广泛使用舱壁模式，每个蜂窝单元是一个独立隔离的部署，服务于客户子集。在2021年US-EAST-1中断期间，具有适当舱壁隔离的蜂窝单元继续运行，而受影响的单元被隔离，防止了全面服务中断并限制了故障影响范围。",
    "when_not_to_use": [
      "Simple applications with a single dependency where isolation adds complexity without benefit",
      "Low-traffic systems where resource contention is unlikely to occur",
      "Stateless serverless functions that inherently isolate through per-invocation containers"
    ],
    "when_not_to_use_zh": [
      "只有单一依赖的简单应用，隔离增加复杂性而无收益",
      "不太可能发生资源争用的低流量系统",
      "通过每次调用独立容器天然隔离的无状态Serverless函数"
    ],
    "adopters": [
      "Amazon",
      "Netflix",
      "Microsoft Azure",
      "Alibaba Cloud",
      "Uber"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Michael Nygard (2007). \"Release It! Design and Deploy Production-Ready Software\". Pragmatic Bookshelf.",
    "secondary_sources": [
      "Michael Nygard (2018). \"Release It! Design and Deploy Production-Ready Software, 2nd Edition\". Pragmatic Bookshelf.",
      "Sam Newman (2015). \"Building Microservices: Designing Fine-Grained Systems\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "circuit-breaker-pattern",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "complement"
      },
      {
        "slug": "cell-based-architecture",
        "type": "related"
      }
    ]
  },
  {
    "id": 52,
    "name": "USE Method",
    "name_zh": "USE 方法",
    "slug": "use-method",
    "category": "quality",
    "desc": "Check Utilization, Saturation, Errors for every resource",
    "desc_zh": "对每项资源检查利用率、饱和度和错误率",
    "steps": [
      "Enumerate all resources: list every hardware and software resource (CPU, memory, disk, network, thread pools, connection pools)",
      "For each resource, measure Utilization: the percentage of time or capacity the resource is actively busy serving work",
      "Measure Saturation: the degree to which extra work is queued or waiting because the resource is at capacity",
      "Measure Errors: count error events related to each resource (disk I/O errors, network timeouts, OOM kills, connection refused)",
      "Iterate systematically: work through the full resource list using USE for each; this methodical sweep avoids overlooking bottlenecks"
    ],
    "steps_zh": [
      "枚举所有资源：列出每项硬件和软件资源（CPU、内存、磁盘、网络、线程池、连接池）",
      "对每项资源测量利用率：资源积极忙于服务工作的时间或容量百分比",
      "测量饱和度：因资源达到容量上限而有额外工作排队或等待的程度",
      "测量错误：统计每项资源相关的错误事件（磁盘I/O错误、网络超时、OOM终止、连接拒绝）",
      "系统化迭代：对完整资源列表逐一使用USE方法；这种系统性扫描避免遗漏瓶颈"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "Utilization",
      "Saturation",
      "Errors"
    ],
    "viz_labels_zh": [
      "利用率",
      "饱和度",
      "错误率"
    ],
    "related": [
      "red-method",
      "four-golden-signals",
      "sli-slo-sla"
    ],
    "tags": [
      "use-method",
      "utilization",
      "saturation",
      "errors",
      "performance"
    ],
    "origin_author": "Brendan Gregg, 2012",
    "origin_source": "The USE Method (brendangregg.com blog post)",
    "origin_source_zh": "《USE方法》（brendangregg.com博客文章）",
    "complexity": "intermediate",
    "when_to_use": [
      "Diagnosing performance problems when you do not know where the bottleneck is",
      "Capacity planning to identify which resources are approaching saturation",
      "Setting up monitoring dashboards for infrastructure and server resources",
      "Post-incident analysis to systematically identify the root cause of resource-related outages"
    ],
    "when_to_use_zh": [
      "诊断性能问题但不知道瓶颈在哪里时",
      "容量规划以识别哪些资源接近饱和",
      "为基础设施和服务器资源建立监控仪表盘",
      "事后分析以系统化识别资源相关故障的根本原因"
    ],
    "core_concepts": [
      "Utilization: The percentage of time a resource is busy or the proportion of its capacity being used",
      "Saturation: The extent to which work is queued waiting for a resource, indicating capacity has been exceeded",
      "Errors: Error events per resource that may indicate hardware degradation or misconfiguration",
      "Resource List: A comprehensive enumeration of all system resources to ensure nothing is overlooked",
      "Functional Block Diagram: A visual map of system components and their resources used to guide the USE analysis"
    ],
    "core_concepts_zh": [
      "利用率：资源忙碌时间的百分比或其容量被使用的比例",
      "饱和度：工作在资源处排队等待的程度，表明容量已被超过",
      "错误：每个资源的错误事件，可能表明硬件退化或配置错误",
      "资源清单：系统所有资源的全面枚举，确保没有遗漏",
      "功能框图：系统组件及其资源的可视化映射，用于指导USE分析"
    ],
    "timeline": [
      [
        "2012",
        "Brendan Gregg publishes 'The USE Method' blog post while working at Joyent"
      ],
      [
        "2013",
        "Gregg publishes 'Systems Performance: Enterprise and the Cloud' expanding on the method"
      ],
      [
        "2014",
        "USE Method becomes a standard troubleshooting methodology taught at performance engineering conferences"
      ],
      [
        "2017",
        "Netflix adopts USE Method as part of their standard performance analysis workflow"
      ],
      [
        "2020",
        "Gregg publishes the second edition of 'Systems Performance' with updated USE Method guidance for cloud-native systems"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Brendan Gregg在Joyent工作时发表「USE方法」博客文章"
      ],
      [
        "2013",
        "Gregg出版《系统性能：企业与云》一书，扩展了该方法"
      ],
      [
        "2014",
        "USE方法成为性能工程会议上教授的标准故障排除方法论"
      ],
      [
        "2017",
        "Netflix将USE方法纳入其标准性能分析工作流"
      ],
      [
        "2020",
        "Gregg出版《系统性能》第二版，包含针对云原生系统的更新USE方法指南"
      ]
    ],
    "dos": [
      "Do create a complete resource checklist before starting because missing a resource means missing a potential bottleneck",
      "Do check errors first for each resource because error diagnosis is often the fastest path to finding issues",
      "Do measure saturation alongside utilization because high utilization alone does not always indicate a problem",
      "Do use the method iteratively because new resources are added as systems evolve"
    ],
    "dos_zh": [
      "开始前创建完整的资源检查清单，因为遗漏一个资源意味着遗漏潜在瓶颈",
      "对每个资源优先检查错误，因为错误诊断通常是发现问题的最快路径",
      "将饱和度与利用率一起测量，因为仅高利用率并不总是表明存在问题",
      "迭代使用该方法，因为随着系统演进会添加新资源"
    ],
    "donts": [
      "Don't skip resources because the bottleneck is often in the resource you least suspect",
      "Don't confuse utilization with saturation because a resource at 90% utilization may have zero saturation",
      "Don't only look at averages because averages hide spikes and tail latency problems",
      "Don't apply USE to request-based workloads because the RED Method is better suited for that"
    ],
    "donts_zh": [
      "不要跳过资源，因为瓶颈通常在你最不怀疑的资源上",
      "不要混淆利用率和饱和度，因为90%利用率的资源可能饱和度为零",
      "不要只看平均值，因为平均值隐藏了尖峰和尾延迟问题",
      "不要将USE应用于基于请求的工作负载，因为RED方法更适合"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's performance engineering team adopted the USE Method as their standard approach for diagnosing performance issues across their cloud infrastructure. When a streaming service experienced intermittent latency spikes, systematic USE analysis revealed that network interface saturation on specific instance types was the root cause, not the CPU or memory bottlenecks initially suspected.",
    "case_study_zh": "Netflix的性能工程团队采用USE方法作为诊断云基础设施性能问题的标准方法。当流媒体服务出现间歇性延迟尖峰时，系统化的USE分析揭示了特定实例类型上网络接口饱和才是根本原因，而非最初怀疑的CPU或内存瓶颈。",
    "when_not_to_use": [
      "Application-layer request debugging where RED Method provides more relevant signals",
      "User experience monitoring where latency and error rates matter more than resource utilization",
      "Serverless architectures where underlying resources are abstracted away from the operator"
    ],
    "when_not_to_use_zh": [
      "应用层请求调试，RED方法能提供更相关的信号",
      "延迟和错误率比资源利用率更重要的用户体验监控",
      "底层资源对运维人员透明的Serverless架构"
    ],
    "adopters": [
      "Netflix",
      "Facebook",
      "LinkedIn",
      "Joyent",
      "Cloudflare"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Brendan Gregg (2012). \"The USE Method\". brendangregg.com.",
    "secondary_sources": [
      "Brendan Gregg (2013). \"Systems Performance: Enterprise and the Cloud\". Prentice Hall.",
      "Brendan Gregg (2020). \"Systems Performance: Enterprise and the Cloud, 2nd Edition\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "red-method",
        "type": "alternative"
      },
      {
        "slug": "four-golden-signals",
        "type": "alternative"
      },
      {
        "slug": "sli-slo-sla",
        "type": "complement"
      }
    ]
  },
  {
    "id": 53,
    "name": "RED Method",
    "name_zh": "RED 方法",
    "slug": "red-method",
    "category": "quality",
    "desc": "Monitor Request rate, Error rate, Duration for each service",
    "desc_zh": "对每个服务监控请求率、错误率和持续时间",
    "steps": [
      "Identify all services: list every microservice, API endpoint, and background worker that handles requests",
      "Measure Rate: instrument each service to emit requests-per-second metrics broken down by endpoint and status code",
      "Measure Errors: track the number and percentage of requests that result in errors (5xx, timeouts, business errors)",
      "Measure Duration: capture request latency distributions (p50, p95, p99) to understand both typical and tail performance",
      "Build dashboards and alerts: create per-service RED dashboards; set alerts on error rate spikes and latency degradation"
    ],
    "steps_zh": [
      "识别所有服务：列出每个处理请求的微服务、API端点和后台工作器",
      "测量速率：为每个服务埋点，按端点和状态码分类发出每秒请求数指标",
      "测量错误：追踪导致错误的请求数量和百分比（5xx、超时、业务错误）",
      "测量持续时间：捕获请求延迟分布（p50、p95、p99）以理解典型和尾部性能",
      "构建仪表盘和告警：创建每服务RED仪表盘；对错误率飙升和延迟劣化设置告警"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "Rate",
      "Errors",
      "Duration"
    ],
    "viz_labels_zh": [
      "请求速率",
      "错误率",
      "响应时延"
    ],
    "related": [
      "use-method",
      "four-golden-signals",
      "dora-metrics"
    ],
    "tags": [
      "red-method",
      "request-rate",
      "error-rate",
      "duration",
      "monitoring"
    ],
    "origin_author": "Tom Wilkie, 2018",
    "origin_source": "The RED Method: How to Instrument Your Services (Grafana Labs blog/talk)",
    "origin_source_zh": "《RED方法：如何为你的服务埋点》（Grafana Labs博客/演讲）",
    "complexity": "beginner",
    "when_to_use": [
      "Monitoring microservices where request-centric metrics provide the most actionable signals",
      "Building SRE dashboards that need to answer 'is this service healthy?' at a glance",
      "Establishing baseline monitoring for any new service deployed to production",
      "Debugging production issues where you need to quickly identify which service is degraded"
    ],
    "when_to_use_zh": [
      "以请求为中心的指标能提供最可操作信号的微服务监控",
      "需要一眼回答「这个服务健康吗？」的SRE仪表盘构建",
      "为任何部署到生产的新服务建立基线监控",
      "需要快速识别哪个服务降级的生产问题调试"
    ],
    "core_concepts": [
      "Rate: The number of requests per second a service is handling, indicating demand and throughput",
      "Errors: The count or percentage of requests that fail, indicating service correctness issues",
      "Duration: The distribution of request latency (p50, p95, p99), indicating service performance",
      "Service-Centric Monitoring: Focusing on the work a service does (requests) rather than the resources it uses",
      "Percentile Latency: Using percentiles instead of averages to expose tail latency and worst-case user experience"
    ],
    "core_concepts_zh": [
      "速率：服务每秒处理的请求数，表示需求和吞吐量",
      "错误：失败请求的计数或百分比，表示服务正确性问题",
      "持续时间：请求延迟的分布（p50、p95、p99），表示服务性能",
      "以服务为中心的监控：关注服务所做的工作（请求）而非使用的资源",
      "百分位延迟：使用百分位数而非平均值来暴露尾延迟和最差用户体验"
    ],
    "timeline": [
      [
        "2015",
        "Tom Wilkie begins developing the RED Method at Weaveworks based on experience with microservices monitoring"
      ],
      [
        "2018",
        "Wilkie formally presents the RED Method at KubeCon and in the Grafana Labs blog"
      ],
      [
        "2019",
        "RED becomes a standard monitoring methodology in the Kubernetes and cloud-native community"
      ],
      [
        "2020",
        "Grafana, Datadog, and New Relic build RED-aligned dashboard templates into their platforms"
      ],
      [
        "2022",
        "RED Method is widely taught alongside USE and Golden Signals in SRE training programs"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Tom Wilkie在Weaveworks根据微服务监控经验开始开发RED方法"
      ],
      [
        "2018",
        "Wilkie在KubeCon和Grafana Labs博客上正式发表RED方法"
      ],
      [
        "2019",
        "RED成为Kubernetes和云原生社区的标准监控方法论"
      ],
      [
        "2020",
        "Grafana、Datadog和New Relic将RED对齐的仪表盘模板内置于其平台"
      ],
      [
        "2022",
        "RED方法在SRE培训项目中与USE和黄金信号并列广泛教授"
      ]
    ],
    "dos": [
      "Do track latency distributions (percentiles) because averages hide tail latency affecting real users",
      "Do separate error types (client vs server, transient vs permanent) because they require different responses",
      "Do set up RED dashboards for every service before going to production because retroactive instrumentation is harder",
      "Do correlate RED metrics with deployment events because many degradations are caused by code changes"
    ],
    "dos_zh": [
      "追踪延迟分布（百分位数），因为平均值隐藏了影响真实用户的尾延迟",
      "区分错误类型（客户端/服务端、瞬时/永久），因为它们需要不同的响应",
      "在上线前为每个服务建立RED仪表盘，因为回溯性埋点更困难",
      "将RED指标与部署事件关联，因为许多降级由代码变更导致"
    ],
    "donts": [
      "Don't use only averages for duration because they conceal p99 spikes that affect user experience",
      "Don't ignore rate changes because sudden drops in request rate often indicate upstream failures",
      "Don't alert on every metric independently because correlated alerts (rate drops + error spikes) are more actionable",
      "Don't apply RED to infrastructure resources because USE Method is designed for that purpose"
    ],
    "donts_zh": [
      "不要只用平均值衡量持续时间，因为平均值掩盖了影响用户体验的p99尖峰",
      "不要忽略速率变化，因为请求率的突然下降通常表示上游故障",
      "不要对每个指标独立告警，因为相关联的告警（速率下降+错误激增）更具可操作性",
      "不要将RED应用于基础设施资源，因为USE方法是为此目的设计的"
    ],
    "case_study_company": "Grafana Labs",
    "case_study": "Grafana Labs uses the RED Method internally to monitor their own Grafana Cloud platform, which serves millions of dashboards and alerts. By standardizing on RED metrics across all microservices, their SRE team can diagnose cross-service issues in minutes. When a spike in error rates was detected in their alerting pipeline, RED dashboards immediately pinpointed the degraded service and specific endpoint.",
    "case_study_zh": "Grafana Labs在内部使用RED方法监控其服务数百万仪表盘和告警的Grafana Cloud平台。通过在所有微服务上标准化RED指标，他们的SRE团队能在几分钟内诊断跨服务问题。当告警管道中检测到错误率尖峰时，RED仪表盘立即定位了降级的服务和具体端点。",
    "when_not_to_use": [
      "Infrastructure and hardware monitoring where resource utilization matters more than request metrics",
      "Batch processing or streaming systems that do not follow a request-response pattern",
      "Storage systems where disk and memory metrics are more relevant than request-level signals"
    ],
    "when_not_to_use_zh": [
      "资源利用率比请求指标更重要的基础设施和硬件监控",
      "不遵循请求-响应模式的批处理或流处理系统",
      "磁盘和内存指标比请求级信号更相关的存储系统"
    ],
    "adopters": [
      "Grafana Labs",
      "Weaveworks",
      "Google",
      "Shopify",
      "Cloudflare"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Tom Wilkie (2018). \"The RED Method: How to Instrument Your Services\". Grafana Labs blog / KubeCon talk.",
    "secondary_sources": [
      "Cindy Sridharan (2018). \"Distributed Systems Observability\". O'Reilly Media.",
      "Brendan Gregg (2012). \"The USE Method\". brendangregg.com."
    ],
    "typed_relations": [
      {
        "slug": "use-method",
        "type": "alternative"
      },
      {
        "slug": "four-golden-signals",
        "type": "alternative"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      }
    ]
  },
  {
    "id": 54,
    "name": "Four Golden Signals",
    "name_zh": "四大黄金信号",
    "slug": "four-golden-signals",
    "category": "quality",
    "desc": "Monitor Latency, Traffic, Errors, Saturation for any service",
    "desc_zh": "对任何服务监控延迟、流量、错误率和饱和度",
    "steps": [
      "Instrument Latency: measure the time it takes to serve a request, separating successful and failed request latencies",
      "Track Traffic: measure the demand on your system — requests per second, transactions per minute, or messages consumed",
      "Monitor Errors: count the rate of requests that fail, distinguishing explicit errors (500s) from implicit ones (wrong content, slow responses)",
      "Measure Saturation: track how close the service is to capacity — CPU usage, memory pressure, queue depths, thread pool exhaustion",
      "Correlate signals: build dashboards that show all four signals together so you can quickly identify whether an issue is load, code, or resource-related"
    ],
    "steps_zh": [
      "埋点延迟：测量服务请求的处理时间，区分成功请求和失败请求的延迟",
      "追踪流量：衡量系统承受的需求——每秒请求数、每分钟事务数或消费的消息数",
      "监控错误：统计失败请求的速率，区分显式错误（500）和隐式错误（内容错误、响应过慢）",
      "测量饱和度：追踪服务距离容量上限的距离——CPU使用率、内存压力、队列深度、线程池耗尽",
      "关联信号：构建同时展示四个信号的仪表盘，以快速判断问题源于负载、代码还是资源"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "Latency",
      "Traffic",
      "Errors",
      "Saturation"
    ],
    "viz_labels_zh": [
      "延迟",
      "流量",
      "错误",
      "饱和度"
    ],
    "related": [
      "red-method",
      "use-method",
      "sli-slo-sla"
    ],
    "tags": [
      "golden-signals",
      "latency",
      "traffic",
      "errors",
      "saturation"
    ],
    "origin_author": "Google SRE Team, 2016",
    "origin_source": "Site Reliability Engineering: How Google Runs Production Systems (O'Reilly)",
    "origin_source_zh": "《SRE：Google运维解密》（O'Reilly）",
    "complexity": "beginner",
    "when_to_use": [
      "Establishing foundational monitoring for any production service regardless of architecture",
      "Training new SRE or DevOps team members on what to monitor first",
      "Creating on-call dashboards that need to surface problems quickly for incident responders",
      "Defining SLIs when you need a starting point for service level indicators"
    ],
    "when_to_use_zh": [
      "为任何生产服务建立基础监控，不论架构如何",
      "培训新SRE或DevOps团队成员首先应监控什么",
      "创建需要为值班人员快速呈现问题的事件响应仪表盘",
      "需要服务级别指标起点时定义SLI"
    ],
    "core_concepts": [
      "Latency: The time it takes to service a request, differentiated between success and failure responses",
      "Traffic: A measure of demand being placed on the system (requests per second, I/O rates, sessions)",
      "Errors: The rate of requests that fail either explicitly (HTTP 5xx) or implicitly (wrong content, policy violations)",
      "Saturation: How full the service is, measuring the most constrained resource and predicting impending capacity limits",
      "Signal Correlation: Analyzing all four signals together to distinguish between load, code, and infrastructure issues"
    ],
    "core_concepts_zh": [
      "延迟：服务请求所花时间，区分成功和失败响应",
      "流量：系统所承受的需求度量（每秒请求数、I/O速率、会话数）",
      "错误：显式（HTTP 5xx）或隐式（内容错误、策略违规）失败的请求速率",
      "饱和度：服务有多满，测量最受限的资源并预测即将到来的容量上限",
      "信号关联：同时分析四个信号以区分负载、代码和基础设施问题"
    ],
    "timeline": [
      [
        "2003",
        "Google's internal SRE practices begin crystallizing around key monitoring signals"
      ],
      [
        "2016",
        "Google publishes the SRE book featuring the Four Golden Signals as Chapter 6"
      ],
      [
        "2017",
        "The SRE Workbook provides detailed implementation guidance for golden signals"
      ],
      [
        "2019",
        "Major observability platforms (Datadog, New Relic, Grafana) build golden signals templates"
      ],
      [
        "2022",
        "Four Golden Signals becomes the most widely cited monitoring framework in the industry"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Google的内部SRE实践开始围绕关键监控信号固化"
      ],
      [
        "2016",
        "Google出版SRE书籍，在第6章介绍四大黄金信号"
      ],
      [
        "2017",
        "SRE工作手册为黄金信号提供了详细的实施指南"
      ],
      [
        "2019",
        "主要可观测性平台（Datadog、New Relic、Grafana）构建黄金信号模板"
      ],
      [
        "2022",
        "四大黄金信号成为业界引用最广泛的监控框架"
      ]
    ],
    "dos": [
      "Do separate latency for successful and failed requests because failed requests can be artificially fast",
      "Do track saturation proactively because it predicts future problems before they cause incidents",
      "Do build a single dashboard showing all four signals per service because correlation enables faster diagnosis",
      "Do define explicit error budgets for each signal because thresholds without budgets lead to alert fatigue"
    ],
    "dos_zh": [
      "分别统计成功和失败请求的延迟，因为失败请求可能人为地很快",
      "主动追踪饱和度，因为它能在引发事故前预测未来问题",
      "为每个服务构建展示全部四个信号的单一仪表盘，因为关联性能加速诊断",
      "为每个信号定义明确的错误预算，因为没有预算的阈值会导致告警疲劳"
    ],
    "donts": [
      "Don't monitor only latency and errors because saturation and traffic provide essential context for interpreting them",
      "Don't use averages exclusively because they hide the tail latency experienced by the worst-affected users",
      "Don't create separate unrelated dashboards for each signal because the power is in seeing them correlated",
      "Don't treat all errors equally because client errors (4xx) and server errors (5xx) have very different implications"
    ],
    "donts_zh": [
      "不要仅监控延迟和错误，因为饱和度和流量为解读它们提供了重要上下文",
      "不要只使用平均值，因为平均值隐藏了受影响最严重用户经历的尾延迟",
      "不要为每个信号创建独立的不相关仪表盘，因为关联展示才有力量",
      "不要平等对待所有错误，因为客户端错误（4xx）和服务端错误（5xx）含义非常不同"
    ],
    "case_study_company": "Google",
    "case_study": "Google developed the Four Golden Signals from over a decade of running production services at massive scale. The approach proved its value during a Gmail latency incident where traffic metrics showed normal request rates but saturation metrics revealed thread pool exhaustion on a specific cluster. Without the saturation signal, the team would have spent hours chasing code-level bugs instead of the resource constraint.",
    "case_study_zh": "Google从十多年大规模运行生产服务的经验中发展出四大黄金信号。该方法在一次Gmail延迟事故中证明了其价值：流量指标显示请求率正常，但饱和度指标揭示了特定集群的线程池耗尽。如果没有饱和度信号，团队会花数小时追查代码级缺陷，而非资源约束。",
    "when_not_to_use": [
      "Low-level hardware debugging where OS-level metrics (USE Method) provide more granular signals",
      "Client-side performance monitoring where Web Vitals and user experience metrics are more appropriate",
      "Offline batch processing where request-oriented signals do not apply"
    ],
    "when_not_to_use_zh": [
      "操作系统级指标（USE方法）能提供更细粒度信号的底层硬件调试",
      "Web Vitals和用户体验指标更合适的客户端性能监控",
      "面向请求的信号不适用的离线批处理"
    ],
    "adopters": [
      "Google",
      "Dropbox",
      "Twitter",
      "Uber",
      "Stripe"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Betsy Beyer, Chris Jones, Jennifer Petoff, and Niall Richard Murphy (2016). \"Site Reliability Engineering: How Google Runs Production Systems\". O'Reilly Media. Chapter 6.",
    "secondary_sources": [
      "Betsy Beyer et al. (2018). \"The Site Reliability Workbook: Practical Ways to Implement SRE\". O'Reilly Media.",
      "Niall Richard Murphy et al. (2020). \"Monitoring and Observability\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "red-method",
        "type": "alternative"
      },
      {
        "slug": "use-method",
        "type": "alternative"
      },
      {
        "slug": "sli-slo-sla",
        "type": "complement"
      }
    ]
  },
  {
    "id": 55,
    "name": "LLM Evaluation Framework",
    "name_zh": "大模型评估框架",
    "slug": "llm-evaluation-framework",
    "category": "quality",
    "desc": "Systematically evaluate LLM output quality and reliability",
    "desc_zh": "系统化评估大语言模型输出的质量与可靠性",
    "steps": [
      "Build golden datasets: curate labeled examples with expected outputs covering core use cases, edge cases, and known failure modes",
      "Define quality dimensions: establish metrics for accuracy, relevance, faithfulness (no hallucination), format compliance, and safety",
      "Implement automated evaluators: use LLM-as-judge, embedding similarity, regex validators, and custom rubrics to score each dimension",
      "Run regression suites: execute evaluations automatically on every prompt change, model upgrade, or RAG pipeline modification",
      "Analyze and iterate: review score trends, investigate regressions, and use failure analysis to drive prompt, retrieval, or model improvements"
    ],
    "steps_zh": [
      "构建黄金数据集：整理覆盖核心用例、边界情况和已知故障模式的标注示例及期望输出",
      "定义质量维度：建立准确性、相关性、忠实度（无幻觉）、格式合规性和安全性的指标",
      "实现自动化评估器：使用LLM作为裁判、嵌入相似度、正则验证器和自定义评分准则对每个维度打分",
      "运行回归套件：在每次提示词变更、模型升级或RAG管道修改时自动执行评估",
      "分析与迭代：审查分数趋势，调查回归问题，利用失败分析驱动提示词、检索或模型改进"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Accuracy",
      "Safety",
      "Latency",
      "Cost",
      "Coherence"
    ],
    "viz_labels_zh": [
      "准确性",
      "安全性",
      "延迟",
      "成本",
      "连贯性"
    ],
    "related": [
      "ai-output-verification",
      "prompt-testing",
      "ai-observability-framework"
    ],
    "tags": [
      "llm-eval",
      "benchmarks",
      "quality-metrics",
      "automated-evaluation",
      "ai"
    ],
    "origin_author": "OpenAI / Anthropic / Google, 2023",
    "origin_source": "HELM: Holistic Evaluation of Language Models (Stanford CRFM, 2022) and industry eval frameworks",
    "origin_source_zh": "《HELM：语言模型整体评估》（斯坦福CRFM，2022）及行业评估框架",
    "complexity": "advanced",
    "when_to_use": [
      "Deploying LLM-powered features to production where output quality must be measurable and tracked",
      "Comparing model versions or providers to make data-driven upgrade decisions",
      "Building RAG pipelines where retrieval and generation quality both need monitoring",
      "Establishing quality gates in CI/CD for prompt engineering workflows"
    ],
    "when_to_use_zh": [
      "将LLM功能部署到输出质量必须可衡量和可追踪的生产环境",
      "比较模型版本或供应商以做出数据驱动的升级决策",
      "构建检索和生成质量都需要监控的RAG管道",
      "在提示词工程工作流的CI/CD中建立质量门控"
    ],
    "core_concepts": [
      "Golden Dataset: A curated set of input-output pairs used as ground truth for evaluation",
      "LLM-as-Judge: Using a language model to evaluate another model's output against defined criteria",
      "Faithfulness: The degree to which generated content is grounded in provided source material without hallucination",
      "Evaluation Rubric: A structured scoring guide defining what constitutes good, acceptable, and poor output",
      "Regression Testing: Running evaluations on every change to detect quality degradation before deployment"
    ],
    "core_concepts_zh": [
      "黄金数据集：作为评估基准真相的精选输入-输出对集合",
      "LLM作为裁判：使用语言模型根据定义的标准评估另一个模型的输出",
      "忠实度：生成内容基于提供的源材料且无幻觉的程度",
      "评估准则：定义优良、可接受和差劣输出的结构化评分指南",
      "回归测试：在每次变更时运行评估以在部署前检测质量退化"
    ],
    "timeline": [
      [
        "2020",
        "OpenAI introduces early evaluation benchmarks for GPT-3 measuring accuracy across tasks"
      ],
      [
        "2022",
        "Stanford CRFM releases HELM providing holistic multi-metric evaluation of language models"
      ],
      [
        "2023",
        "Braintrust, Langsmith, and Promptfoo emerge as dedicated LLM evaluation platforms"
      ],
      [
        "2024",
        "LLM-as-Judge becomes the dominant evaluation pattern, validated by research from Anthropic and Google"
      ],
      [
        "2025",
        "Evaluation frameworks mature to include multi-turn agent evaluation and tool-use correctness"
      ]
    ],
    "timeline_zh": [
      [
        "2020",
        "OpenAI为GPT-3引入早期评估基准，衡量跨任务准确性"
      ],
      [
        "2022",
        "斯坦福CRFM发布HELM，提供语言模型的整体多指标评估"
      ],
      [
        "2023",
        "Braintrust、Langsmith和Promptfoo作为专用LLM评估平台涌现"
      ],
      [
        "2024",
        "LLM作为裁判成为主导评估模式，经Anthropic和Google研究验证"
      ],
      [
        "2025",
        "评估框架成熟到包含多轮智能体评估和工具使用正确性"
      ]
    ],
    "dos": [
      "Do build diverse golden datasets because biased test sets produce misleading evaluation scores",
      "Do evaluate multiple quality dimensions because a model can be accurate but unsafe or unfaithful",
      "Do version evaluations alongside prompts because evaluation criteria must evolve with the application",
      "Do include adversarial test cases because edge cases and attacks reveal real-world failure modes"
    ],
    "dos_zh": [
      "构建多样化的黄金数据集，因为有偏差的测试集会产生误导性评估分数",
      "评估多个质量维度，因为模型可能准确但不安全或不忠实",
      "将评估与提示词一起版本化，因为评估标准必须随应用演进",
      "包含对抗性测试用例，因为边界情况和攻击揭示了真实世界的失败模式"
    ],
    "donts": [
      "Don't rely solely on LLM-as-Judge because model judges have their own biases and failure modes",
      "Don't use a single metric because reducing quality to one number hides important failure dimensions",
      "Don't evaluate only happy paths because production failures occur on unexpected and adversarial inputs",
      "Don't skip human evaluation entirely because automated metrics can miss nuanced quality issues"
    ],
    "donts_zh": [
      "不要仅依赖LLM作为裁判，因为模型裁判有自身的偏见和失败模式",
      "不要使用单一指标，因为将质量压缩为一个数字会隐藏重要的失败维度",
      "不要只评估正向路径，因为生产故障发生在意外和对抗性输入上",
      "不要完全跳过人工评估，因为自动化指标可能遗漏细微的质量问题"
    ],
    "case_study_company": "Anthropic",
    "case_study": "Anthropic built extensive evaluation frameworks for Claude, including automated benchmarks across helpfulness, harmlessness, and honesty dimensions. Their evaluation suite runs thousands of test cases on every model iteration, enabling rapid detection of capability regressions and safety issues. This systematic approach allows them to ship model updates with high confidence that quality has not degraded.",
    "case_study_zh": "Anthropic为Claude构建了广泛的评估框架，包括跨有用性、无害性和诚实性维度的自动化基准。他们的评估套件在每次模型迭代时运行数千个测试用例，能快速检测能力回归和安全问题。这种系统化方法使他们能高信心地发布模型更新，确保质量未退化。",
    "when_not_to_use": [
      "Simple rule-based systems where deterministic output validation is sufficient",
      "One-off exploratory uses of LLMs where systematic evaluation adds overhead without ongoing value",
      "Early prototyping phases where the evaluation criteria themselves are not yet defined"
    ],
    "when_not_to_use_zh": [
      "确定性输出验证足够的简单规则系统",
      "LLM的一次性探索使用，系统评估增加开销但无持续价值",
      "评估标准本身尚未定义的早期原型阶段"
    ],
    "adopters": [
      "Anthropic",
      "OpenAI",
      "Google DeepMind",
      "Microsoft",
      "Braintrust"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "testability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Percy Liang et al. (2022). \"Holistic Evaluation of Language Models (HELM)\". Stanford CRFM.",
    "secondary_sources": [
      "Yuntao Bai et al. (2022). \"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback\". Anthropic.",
      "Dan Hendrycks et al. (2021). \"Measuring Massive Multitask Language Understanding (MMLU)\". ICLR 2021."
    ],
    "typed_relations": [
      {
        "slug": "ai-output-verification",
        "type": "complement"
      },
      {
        "slug": "prompt-testing",
        "type": "complement"
      },
      {
        "slug": "ai-observability-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 56,
    "name": "AI Output Verification",
    "name_zh": "AI 输出验证",
    "slug": "ai-output-verification",
    "category": "quality",
    "desc": "Multi-layer checks ensuring AI-generated content is trustworthy",
    "desc_zh": "多层检查确保AI生成内容的可信度与正确性",
    "steps": [
      "Schema validation: verify that AI output conforms to the expected structure (JSON schema, type checks, required fields)",
      "Factual grounding check: cross-reference generated claims against retrieved source documents or knowledge bases",
      "Consistency verification: compare the output against the input constraints and previously generated outputs for logical coherence",
      "Safety and policy filtering: run the output through toxicity classifiers, PII detectors, and domain-specific policy rules",
      "Human spot-check sampling: randomly route a percentage of outputs to human reviewers and use disagreements to improve automated checks"
    ],
    "steps_zh": [
      "模式验证：验证AI输出符合预期结构（JSON Schema、类型检查、必填字段）",
      "事实依据检查：将生成的声明与检索的源文档或知识库交叉参照",
      "一致性验证：将输出与输入约束和先前生成的输出进行对比，检查逻辑连贯性",
      "安全与策略过滤：通过毒性分类器、PII检测器和领域特定策略规则过滤输出",
      "人工抽查：随机将一定比例的输出路由给人工审阅者，利用分歧改进自动化检查"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Generate",
      "Validate",
      "Filter",
      "Deliver"
    ],
    "viz_labels_zh": [
      "生成",
      "验证",
      "过滤",
      "交付"
    ],
    "related": [
      "llm-evaluation-framework",
      "guardrails-framework",
      "prompt-testing"
    ],
    "tags": [
      "verification",
      "hallucination-detection",
      "safety",
      "grounding",
      "ai"
    ],
    "origin_author": "Guardrails AI / NeMo Guardrails (NVIDIA), 2023",
    "origin_source": "Guardrails AI framework and NVIDIA NeMo Guardrails documentation",
    "origin_source_zh": "Guardrails AI框架及NVIDIA NeMo Guardrails文档",
    "complexity": "intermediate",
    "when_to_use": [
      "Production AI applications where incorrect outputs could cause financial, legal, or safety harm",
      "Customer-facing AI features where hallucinated content damages brand trust",
      "Regulated industries (healthcare, finance) where AI outputs must be auditable and verifiable",
      "RAG-based systems where generated answers must be traceable to source documents"
    ],
    "when_to_use_zh": [
      "错误输出可能造成财务、法律或安全损害的生产AI应用",
      "幻觉内容会损害品牌信任的面向客户的AI功能",
      "AI输出必须可审计和可验证的受监管行业（医疗、金融）",
      "生成答案必须可追溯到源文档的RAG系统"
    ],
    "core_concepts": [
      "Schema Validation: Structural verification ensuring AI output matches expected format, types, and required fields",
      "Factual Grounding: Cross-referencing generated claims against authoritative source documents to detect hallucinations",
      "Output Guardrails: Automated filters that block or modify outputs violating safety, policy, or quality rules",
      "PII Detection: Scanning generated content for personally identifiable information that should not be exposed",
      "Human-in-the-Loop: Routing a sample of outputs to human reviewers for quality assurance and model improvement"
    ],
    "core_concepts_zh": [
      "模式验证：结构性验证确保AI输出匹配预期格式、类型和必填字段",
      "事实依据：将生成的声明与权威源文档交叉参照以检测幻觉",
      "输出护栏：自动过滤违反安全、策略或质量规则的输出",
      "PII检测：扫描生成内容中不应暴露的个人身份信息",
      "人在回路中：将输出样本路由给人工审阅者以进行质量保证和模型改进"
    ],
    "timeline": [
      [
        "2022",
        "Hallucination detection emerges as a critical research area as LLMs deploy widely in production"
      ],
      [
        "2023",
        "Guardrails AI releases open-source framework for validating LLM outputs with programmable rules"
      ],
      [
        "2023",
        "NVIDIA releases NeMo Guardrails for controlling LLM conversational behavior and output safety"
      ],
      [
        "2024",
        "Major cloud providers (AWS Bedrock, Azure AI) build output verification into their managed LLM services"
      ],
      [
        "2025",
        "Multi-layer verification becomes standard practice with schema, grounding, safety, and human checks in pipeline"
      ]
    ],
    "timeline_zh": [
      [
        "2022",
        "随着LLM广泛部署到生产环境，幻觉检测成为关键研究领域"
      ],
      [
        "2023",
        "Guardrails AI发布开源框架，用可编程规则验证LLM输出"
      ],
      [
        "2023",
        "NVIDIA发布NeMo Guardrails用于控制LLM对话行为和输出安全"
      ],
      [
        "2024",
        "主要云提供商（AWS Bedrock、Azure AI）将输出验证内置到其托管LLM服务中"
      ],
      [
        "2025",
        "多层验证成为标准实践，在管道中包含模式、依据、安全和人工检查"
      ]
    ],
    "dos": [
      "Do implement verification as a pipeline with multiple independent layers because no single check catches everything",
      "Do ground-check factual claims against source documents because hallucinations are the top production risk for LLMs",
      "Do log all verification failures for analysis because patterns in failures drive systematic improvements",
      "Do include adversarial inputs in your test suite because users will probe boundaries in unexpected ways"
    ],
    "dos_zh": [
      "将验证实现为多个独立层的管道，因为没有单一检查能捕获所有问题",
      "将事实声明与源文档进行依据检查，因为幻觉是LLM的首要生产风险",
      "记录所有验证失败以供分析，因为失败模式驱动系统化改进",
      "在测试套件中包含对抗性输入，因为用户会以意想不到的方式探测边界"
    ],
    "donts": [
      "Don't deploy AI features without output verification because unverified outputs can carry hallucinations straight into production",
      "Don't rely solely on the LLM to self-verify because models are poor judges of their own hallucinations",
      "Don't treat verification as a one-time setup because new failure modes emerge as usage patterns evolve",
      "Don't block all outputs on strict rules because overly aggressive filtering degrades user experience"
    ],
    "donts_zh": [
      "不要在没有输出验证的情况下部署AI功能，因为未经验证的输出可能将幻觉内容直接带入生产环境",
      "不要仅依赖LLM自我验证，因为模型不善于判断自身的幻觉",
      "不要将验证视为一次性设置，因为随着使用模式演进会出现新的失败模式",
      "不要用严格规则阻止所有输出，因为过于激进的过滤会降低用户体验"
    ],
    "case_study_company": "Bing Chat (Microsoft)",
    "case_study": "After Microsoft launched Bing Chat (now Copilot) in early 2023, several high-profile hallucination incidents prompted them to build robust output verification layers. They implemented factual grounding checks against Bing search results, citation verification, and safety classifiers. These multi-layer checks significantly reduced hallucination rates and inappropriate responses in subsequent releases.",
    "case_study_zh": "微软在2023年初推出Bing Chat（现为Copilot）后，几次高调的幻觉事件促使他们构建了强大的输出验证层。他们实现了基于Bing搜索结果的事实依据检查、引用验证和安全分类器。这些多层检查在后续版本中显著降低了幻觉率和不当响应。",
    "when_not_to_use": [
      "Creative writing or brainstorming use cases where factual grounding is not required",
      "Internal developer tools where the cost of occasional errors is low and human review is built in",
      "Prototype or demo environments where verification overhead is not justified"
    ],
    "when_not_to_use_zh": [
      "不需要事实依据的创意写作或头脑风暴场景",
      "偶尔出错成本低且已内置人工审查的内部开发工具",
      "验证开销不合理的原型或演示环境"
    ],
    "adopters": [
      "Microsoft",
      "Google",
      "Anthropic",
      "Amazon",
      "Salesforce"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "security"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Shreya Rajpal (2023). \"Guardrails AI: Adding Guardrails to Large Language Models\". github.com/guardrails-ai.",
    "secondary_sources": [
      "NVIDIA (2023). \"NeMo Guardrails: A Toolkit for Controllable and Safe LLM Applications with Programmable Rails\". arXiv:2310.10501.",
      "Yuntao Bai et al. (2022). \"Constitutional AI: Harmlessness from AI Feedback\". arXiv:2212.08073."
    ],
    "typed_relations": [
      {
        "slug": "llm-evaluation-framework",
        "type": "complement"
      },
      {
        "slug": "guardrails-framework",
        "type": "complement"
      },
      {
        "slug": "prompt-testing",
        "type": "complement"
      }
    ]
  },
  {
    "id": 57,
    "name": "Agent Reliability Patterns",
    "name_zh": "智能体可靠性模式",
    "slug": "agent-reliability-patterns",
    "category": "quality",
    "desc": "Patterns ensuring AI agents behave predictably in production",
    "desc_zh": "确保AI智能体在生产环境中行为可预测的模式集合",
    "steps": [
      "Implement bounded execution: set max step limits, wall-clock timeouts, and token budgets to prevent runaway agent loops",
      "Add checkpoint and resume: persist agent state at key decision points so failed runs can be retried from the last checkpoint",
      "Design deterministic fallbacks: when an agent step fails or times out, fall back to a simpler strategy or escalate to a human",
      "Use idempotent tool calls: ensure all tools the agent invokes can be safely retried, with deduplication keys preventing duplicate side effects",
      "Monitor agent behavior: track step counts, tool call success rates, loop detection, and cost per task to identify reliability regressions"
    ],
    "steps_zh": [
      "实现有界执行：设置最大步数限制、挂钟超时和Token预算以防止智能体失控循环",
      "添加检查点与恢复：在关键决策点持久化智能体状态，使失败的运行可从最后检查点重试",
      "设计确定性降级：当智能体步骤失败或超时时，降级到更简单的策略或升级给人类处理",
      "使用幂等工具调用：确保智能体调用的所有工具可安全重试，用去重键防止重复副作用",
      "监控智能体行为：追踪步数、工具调用成功率、循环检测和每任务成本以识别可靠性回归"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Plan",
      "Execute",
      "Retry",
      "Fallback"
    ],
    "viz_labels_zh": [
      "规划",
      "执行",
      "重试",
      "回退"
    ],
    "related": [
      "circuit-breaker-pattern",
      "agent-deployment-patterns",
      "chaos-engineering"
    ],
    "tags": [
      "agent-reliability",
      "bounded-execution",
      "idempotency",
      "checkpointing",
      "ai"
    ],
    "origin_author": "Anthropic / LangChain / OpenAI, 2023",
    "origin_source": "Building Reliable AI Agents (industry practices from Anthropic, LangChain, and OpenAI documentation)",
    "origin_source_zh": "《构建可靠AI智能体》（来自Anthropic、LangChain和OpenAI文档的行业实践）",
    "complexity": "advanced",
    "when_to_use": [
      "Deploying autonomous AI agents that perform multi-step tasks with real-world side effects",
      "Building agent systems that interact with external APIs, databases, or file systems",
      "Production environments where agent failures must be contained and recovered from gracefully",
      "Cost-sensitive deployments where runaway agent loops can generate massive API bills"
    ],
    "when_to_use_zh": [
      "部署执行有真实副作用的多步任务的自主AI智能体",
      "构建与外部API、数据库或文件系统交互的智能体系统",
      "智能体故障必须被遏制且优雅恢复的生产环境",
      "失控智能体循环可能产生巨额API账单的成本敏感部署"
    ],
    "core_concepts": [
      "Bounded Execution: Hard limits on step count, time, and token usage to prevent infinite agent loops",
      "Checkpointing: Persisting agent state at decision points to enable resume-from-failure without restarting",
      "Idempotent Tool Calls: Designing tools so that retrying the same call produces the same result without duplicate side effects",
      "Deterministic Fallbacks: Pre-defined simpler strategies activated when the primary agent path fails",
      "Loop Detection: Monitoring for repetitive agent behaviors that indicate the agent is stuck in an unproductive cycle"
    ],
    "core_concepts_zh": [
      "有界执行：对步数、时间和Token使用设置硬限制以防止无限智能体循环",
      "检查点：在决策点持久化智能体状态以实现从故障处恢复而无需重启",
      "幂等工具调用：设计工具使重试同一调用产生相同结果而无重复副作用",
      "确定性降级：当主要智能体路径失败时激活的预定义更简单策略",
      "循环检测：监控指示智能体陷入无效循环的重复行为"
    ],
    "timeline": [
      [
        "2023",
        "Early LLM agents (AutoGPT, BabyAGI) demonstrate both potential and reliability challenges of autonomous AI"
      ],
      [
        "2023",
        "LangChain and LlamaIndex formalize agent architectures with tool use, planning, and memory patterns"
      ],
      [
        "2024",
        "Anthropic publishes best practices for building reliable Claude-based agents with bounded execution"
      ],
      [
        "2024",
        "OpenAI Assistants API and function calling introduce structured tool use with built-in safety mechanisms"
      ],
      [
        "2025",
        "Agent reliability patterns mature into production-grade frameworks with observability, cost controls, and human escalation"
      ]
    ],
    "timeline_zh": [
      [
        "2023",
        "早期LLM智能体（AutoGPT、BabyAGI）展示了自主AI的潜力和可靠性挑战"
      ],
      [
        "2023",
        "LangChain和LlamaIndex正式化了包含工具使用、规划和记忆模式的智能体架构"
      ],
      [
        "2024",
        "Anthropic发布构建可靠Claude智能体的最佳实践，包含有界执行"
      ],
      [
        "2024",
        "OpenAI Assistants API和函数调用引入了带内置安全机制的结构化工具使用"
      ],
      [
        "2025",
        "智能体可靠性模式成熟为生产级框架，包含可观测性、成本控制和人工升级"
      ]
    ],
    "dos": [
      "Do set hard budget limits (steps, tokens, time) because unconstrained agents will eventually enter infinite loops",
      "Do make all tool calls idempotent because agent retries are inevitable and must not cause duplicate actions",
      "Do implement human escalation paths because agents will encounter situations beyond their reliable capability",
      "Do log every agent decision and tool call because debugging agent failures requires full execution traces"
    ],
    "dos_zh": [
      "设置硬预算限制（步数、Token、时间），因为不受约束的智能体最终会进入无限循环",
      "使所有工具调用幂等，因为智能体重试不可避免且不能导致重复操作",
      "实现人工升级路径，因为智能体会遇到超出其可靠能力的情况",
      "记录每个智能体决策和工具调用，因为调试智能体故障需要完整的执行跟踪"
    ],
    "donts": [
      "Don't give agents unlimited tool access because broad permissions amplify the impact of agent errors",
      "Don't skip loop detection because a stuck agent wastes resources and may cause unintended side effects",
      "Don't deploy agents without monitoring because silent failures accumulate cost and erode user trust",
      "Don't assume agent behavior is deterministic because LLM-based agents can take different paths on identical inputs"
    ],
    "donts_zh": [
      "不要给智能体无限制的工具访问权限，因为广泛权限会放大智能体错误的影响",
      "不要跳过循环检测，因为陷入循环的智能体浪费资源且可能造成意外副作用",
      "不要在没有监控的情况下部署智能体，因为静默失败会累积成本并侵蚀用户信任",
      "不要假设智能体行为是确定性的，因为基于LLM的智能体在相同输入上可能走不同路径"
    ],
    "case_study_company": "Anthropic",
    "case_study": "Anthropic developed agent reliability patterns while building Claude's tool-use capabilities, discovering that bounded execution and checkpointing were essential after observing agents entering loops during complex multi-step tasks. Their internal agent framework enforces token budgets, implements retry-with-backoff for tool calls, and includes automatic human escalation when confidence drops below thresholds.",
    "case_study_zh": "Anthropic在构建Claude的工具使用能力时开发了智能体可靠性模式，在观察到智能体在复杂多步任务中进入循环后，发现有界执行和检查点是必要的。他们的内部智能体框架强制Token预算，为工具调用实现退避重试，并在信心低于阈值时自动升级给人类。",
    "when_not_to_use": [
      "Simple single-turn LLM calls that do not involve tool use or multi-step reasoning",
      "Fully supervised human-in-the-loop workflows where every agent action requires approval",
      "Offline batch inference where failures can be retried without real-time reliability concerns"
    ],
    "when_not_to_use_zh": [
      "不涉及工具使用或多步推理的简单单轮LLM调用",
      "每个智能体动作都需要审批的完全监督人在回路工作流",
      "失败可重试且无实时可靠性顾虑的离线批量推理"
    ],
    "adopters": [
      "Anthropic",
      "OpenAI",
      "Google DeepMind",
      "LangChain",
      "Replit"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Anthropic (2024). \"Building Effective Agents\". anthropic.com.",
    "secondary_sources": [
      "Harrison Chase et al. (2023). \"LangChain: Building Context-Aware Reasoning Applications\". langchain.com.",
      "OpenAI (2023). \"GPT Best Practices: Strategy for Reliable Outputs\". platform.openai.com."
    ],
    "typed_relations": [
      {
        "slug": "circuit-breaker-pattern",
        "type": "extends"
      },
      {
        "slug": "agent-deployment-patterns",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 58,
    "name": "Prompt Testing",
    "name_zh": "提示词测试",
    "slug": "prompt-testing",
    "category": "quality",
    "desc": "Automated regression testing for LLM prompt changes",
    "desc_zh": "对大模型提示词变更进行自动化回归测试",
    "steps": [
      "Build a test suite: create a collection of input-expected output pairs that cover normal cases, edge cases, and known failure modes",
      "Define assertions: use exact match, contains, regex, semantic similarity, or LLM-as-judge to verify each output meets expectations",
      "Version prompts as code: store prompts in version control alongside their test suites so every change is tracked and reviewable",
      "Run in CI: execute prompt tests on every PR that modifies prompts; fail the build if quality scores drop below thresholds",
      "Track quality over time: log evaluation scores per prompt version to detect gradual drift and enable data-driven prompt optimization"
    ],
    "steps_zh": [
      "构建测试套件：创建覆盖正常情况、边界情况和已知故障模式的输入-期望输出对集合",
      "定义断言：使用精确匹配、包含、正则、语义相似度或LLM作为裁判来验证每个输出是否符合预期",
      "将提示词作为代码版本化：将提示词及其测试套件存入版本控制，使每次变更可追踪和审查",
      "在CI中运行：对每个修改提示词的PR执行提示词测试；质量分数低于阈值时构建失败",
      "追踪质量趋势：记录每个提示词版本的评估分数以检测渐进漂移，实现数据驱动的提示词优化"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Write",
      "Evaluate",
      "Refine",
      "Regression"
    ],
    "viz_labels_zh": [
      "编写提示",
      "评估输出",
      "优化提示",
      "回归测试"
    ],
    "related": [
      "llm-evaluation-framework",
      "ai-output-verification",
      "prompt-engineering-patterns"
    ],
    "tags": [
      "prompt-testing",
      "regression",
      "ci-cd",
      "evaluation",
      "ai"
    ],
    "origin_author": "Promptfoo (Ian Webster), 2023",
    "origin_source": "Promptfoo: Test Your LLM App (open-source project and documentation)",
    "origin_source_zh": "《Promptfoo：测试你的LLM应用》（开源项目及文档）",
    "complexity": "intermediate",
    "when_to_use": [
      "Teams iterating on prompts frequently and needing confidence that changes do not break existing behavior",
      "Production LLM applications where prompt regressions directly impact user experience",
      "Multi-prompt pipelines where changes to one prompt can cascade and affect downstream outputs",
      "Organizations building prompt engineering as a disciplined, measurable practice"
    ],
    "when_to_use_zh": [
      "频繁迭代提示词且需要确信变更不会破坏现有行为的团队",
      "提示词回归直接影响用户体验的生产LLM应用",
      "一个提示词的变更可能级联影响下游输出的多提示词管道",
      "将提示词工程建设为有纪律、可衡量实践的组织"
    ],
    "core_concepts": [
      "Prompt Versioning: Tracking prompt text in version control like code to enable diff, review, and rollback",
      "Assertion Types: Multiple validation methods including exact match, regex, semantic similarity, and LLM-as-judge",
      "Regression Suite: A collection of test cases that must pass before any prompt change is deployed",
      "Quality Scores: Quantitative metrics tracking prompt performance over time across multiple dimensions",
      "A/B Evaluation: Comparing two prompt versions against the same test cases to measure relative improvement"
    ],
    "core_concepts_zh": [
      "提示词版本化：像代码一样在版本控制中跟踪提示词文本，以实现差异对比、审查和回滚",
      "断言类型：包括精确匹配、正则、语义相似度和LLM作为裁判的多种验证方法",
      "回归套件：在部署任何提示词变更前必须通过的测试用例集合",
      "质量分数：跨多个维度追踪提示词性能随时间变化的定量指标",
      "A/B评估：将两个提示词版本对同一测试用例进行比较以衡量相对改进"
    ],
    "timeline": [
      [
        "2022",
        "Early prompt engineering practitioners begin manually testing prompts with spreadsheets and scripts"
      ],
      [
        "2023",
        "Promptfoo releases as an open-source CLI tool for systematic prompt evaluation and comparison"
      ],
      [
        "2023",
        "LangSmith and Braintrust launch cloud platforms for prompt testing with built-in analytics"
      ],
      [
        "2024",
        "Prompt testing integrates into standard CI/CD pipelines alongside traditional software tests"
      ],
      [
        "2025",
        "Prompt testing tools mature to support multi-turn agent evaluation and model-agnostic testing"
      ]
    ],
    "timeline_zh": [
      [
        "2022",
        "早期提示词工程实践者开始用电子表格和脚本手动测试提示词"
      ],
      [
        "2023",
        "Promptfoo作为用于系统化提示词评估和比较的开源CLI工具发布"
      ],
      [
        "2023",
        "LangSmith和Braintrust推出带内置分析的提示词测试云平台"
      ],
      [
        "2024",
        "提示词测试与传统软件测试一起集成到标准CI/CD管道中"
      ],
      [
        "2025",
        "提示词测试工具成熟到支持多轮智能体评估和模型无关测试"
      ]
    ],
    "dos": [
      "Do version prompts alongside their tests because untested prompt changes are the leading cause of LLM regressions",
      "Do use multiple assertion types because no single check captures all quality dimensions of LLM output",
      "Do include adversarial and edge case inputs because prompts often fail on inputs the author did not consider",
      "Do track scores over time because gradual drift is harder to notice than sudden regression"
    ],
    "dos_zh": [
      "将提示词与其测试一起版本化，因为未测试的提示词变更是LLM回归的首要原因",
      "使用多种断言类型，因为没有单一检查能捕获LLM输出的所有质量维度",
      "包含对抗性和边界输入，因为提示词经常在作者未考虑的输入上失败",
      "追踪分数随时间的变化，因为渐进漂移比突然回归更难察觉"
    ],
    "donts": [
      "Don't change prompts in production without running regression tests because seemingly minor edits can cause major output changes",
      "Don't test only with ideal inputs because production users will provide messy, ambiguous, and adversarial inputs",
      "Don't ignore non-determinism because the same prompt can produce different outputs across runs",
      "Don't rely solely on exact match assertions because LLM outputs are naturally variable and semantically equivalent answers may differ in phrasing"
    ],
    "donts_zh": [
      "不要在未运行回归测试的情况下在生产中修改提示词，因为看似微小的编辑可能导致重大输出变化",
      "不要仅用理想输入测试，因为生产用户会提供混乱、模糊和对抗性的输入",
      "不要忽略非确定性，因为同一提示词在不同运行中可能产生不同输出",
      "不要仅依赖精确匹配断言，因为LLM输出自然具有变异性，语义等价的答案可能措辞不同"
    ],
    "case_study_company": "Shopify",
    "case_study": "Shopify implemented prompt testing for their AI-powered product description generator that serves millions of merchants. After a prompt change inadvertently degraded output quality for non-English products, they built a comprehensive regression suite with 500+ test cases across 12 languages. This caught three subsequent regressions before they reached production, averting significant merchant-facing impact.",
    "case_study_zh": "Shopify为其服务数百万商家的AI产品描述生成器实施了提示词测试。在一次提示词变更意外降低了非英语产品的输出质量后，他们构建了一个包含12种语言500多个测试用例的综合回归套件。这在后续三次回归到达生产之前就将其捕获，避免了对商家的重大影响。",
    "when_not_to_use": [
      "One-off prompt usage where the prompt is not maintained or reused over time",
      "Creative applications where output variability is desired and correctness is subjective",
      "Very early experimentation where prompts change so rapidly that maintaining tests is counterproductive"
    ],
    "when_not_to_use_zh": [
      "提示词不会长期维护或复用的一次性使用",
      "输出变异性是期望的且正确性是主观的创意应用",
      "提示词变化极快以至于维护测试适得其反的早期实验"
    ],
    "adopters": [
      "Shopify",
      "Vercel",
      "Notion",
      "Stripe",
      "GitHub Copilot"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "testability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Ian Webster (2023). \"Promptfoo: Test Your LLM App\". promptfoo.dev.",
    "secondary_sources": [
      "Braintrust (2023). \"Braintrust: Enterprise-Grade LLM Evaluation\". braintrustdata.com.",
      "Percy Liang et al. (2022). \"Holistic Evaluation of Language Models (HELM)\". Stanford CRFM."
    ],
    "typed_relations": [
      {
        "slug": "llm-evaluation-framework",
        "type": "complement"
      },
      {
        "slug": "ai-output-verification",
        "type": "complement"
      },
      {
        "slug": "prompt-engineering-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 178,
    "name": "Mutation Testing",
    "name_zh": "变异测试",
    "slug": "mutation-testing",
    "category": "quality",
    "desc": "Test the tests by introducing code mutations and verifying that tests catch them",
    "desc_zh": "通过引入代码变异来测试测试本身，验证测试能否捕获这些变异",
    "steps": [
      "Select a mutation testing tool appropriate for your language: Stryker for JS/TS, PIT for Java, mutmut for Python, or similar",
      "Run the mutation tool against your codebase: it generates mutants by applying small syntactic changes (flip operators, remove calls, change constants)",
      "Analyze the mutation score: the percentage of mutants killed by your existing tests — surviving mutants indicate gaps in test quality",
      "Strengthen tests to kill surviving mutants: write targeted assertions that cover the specific logic the mutant altered",
      "Integrate mutation testing into CI as a quality gate: set a minimum mutation score threshold and fail builds that drop below it"
    ],
    "steps_zh": [
      "选择适合你语言的变异测试工具：JS/TS用Stryker，Java用PIT，Python用mutmut等",
      "对代码库运行变异工具：通过应用小型语法变更（翻转运算符、移除调用、更改常量）生成变异体",
      "分析变异分数：现有测试杀死的变异体百分比——存活的变异体表明测试质量存在缺口",
      "加强测试以杀死存活的变异体：编写针对变异体所改变的特定逻辑的定向断言",
      "将变异测试集成到CI中作为质量门禁：设置最低变异分数阈值，低于阈值则构建失败"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Mutate",
      "Run Tests",
      "Detect",
      "Score"
    ],
    "viz_labels_zh": [
      "变异代码",
      "运行测试",
      "检测",
      "评分"
    ],
    "related": [
      "test-pyramid",
      "tdd",
      "property-based-testing"
    ],
    "tags": [
      "testing",
      "mutation-testing",
      "test-quality",
      "code-coverage",
      "quality-gate"
    ],
    "origin_author": "Richard Lipton, 1971",
    "origin_source": "Fault Diagnosis of Computer Programs (Lipton, 1971)",
    "origin_source_zh": "《计算机程序的故障诊断》（Lipton，1971）",
    "complexity": "intermediate",
    "when_to_use": [
      "After achieving high code coverage but still experiencing production bugs that tests should have caught",
      "When evaluating whether a test suite provides genuine confidence or merely exercises code paths without meaningful assertions",
      "During quality audits to quantify the real effectiveness of existing tests beyond line coverage metrics",
      "When building safety-critical or financial systems where test thoroughness directly impacts risk"
    ],
    "when_to_use_zh": [
      "在达到高代码覆盖率但仍然遇到测试本应捕获的生产缺陷时",
      "评估测试套件是否提供真正的信心，还是仅执行代码路径而没有有意义的断言时",
      "在质量审计中量化现有测试超越行覆盖率指标的真实有效性时",
      "构建安全关键或金融系统时，测试的彻底性直接影响风险"
    ],
    "core_concepts": [
      "Mutant: A small syntactic change to the source code (e.g., replacing + with -, flipping a boolean, removing a method call)",
      "Mutation Score: The ratio of killed mutants to total mutants, measuring how effectively the test suite detects faults",
      "Killed Mutant: A mutant that causes at least one test to fail, proving the test suite detects that specific fault",
      "Survived Mutant: A mutant that passes all tests, revealing a gap in test coverage or assertion quality",
      "Equivalent Mutant: A mutant that produces identical behavior to the original code, which cannot be killed and must be excluded from scoring"
    ],
    "core_concepts_zh": [
      "变异体：对源代码的小型语法更改（例如将+替换为-、翻转布尔值、移除方法调用）",
      "变异分数：被杀死的变异体与总变异体的比率，衡量测试套件检测故障的有效性",
      "被杀死的变异体：导致至少一个测试失败的变异体，证明测试套件能检测该特定故障",
      "存活的变异体：通过所有测试的变异体，揭示测试覆盖或断言质量的缺口",
      "等价变异体：产生与原始代码相同行为的变异体，无法被杀死，必须从评分中排除"
    ],
    "timeline": [
      [
        "1971",
        "Richard Lipton proposes mutation analysis as a theoretical framework for evaluating test adequacy"
      ],
      [
        "1978",
        "DeMillo, Lipton, and Sayward publish foundational paper establishing mutation testing as a practical technique"
      ],
      [
        "2010",
        "PIT (Pitest) launches for Java, making mutation testing practical for real-world projects"
      ],
      [
        "2017",
        "Stryker Mutator releases for JavaScript/TypeScript, bringing mutation testing to the frontend ecosystem"
      ],
      [
        "2019",
        "Mutation testing gains mainstream attention as CI tooling matures and execution speed improves dramatically"
      ]
    ],
    "timeline_zh": [
      [
        "1971",
        "Richard Lipton提出变异分析作为评估测试充分性的理论框架"
      ],
      [
        "1978",
        "DeMillo、Lipton和Sayward发表奠基性论文，将变异测试确立为实用技术"
      ],
      [
        "2010",
        "PIT（Pitest）为Java推出，使变异测试在实际项目中变得可行"
      ],
      [
        "2017",
        "Stryker Mutator为JavaScript/TypeScript发布，将变异测试带入前端生态"
      ],
      [
        "2019",
        "随着CI工具成熟和执行速度大幅提升，变异测试获得主流关注"
      ]
    ],
    "dos": [
      "Do start with critical business logic modules because mutation testing on the entire codebase can be prohibitively slow",
      "Do use mutation score as a complement to coverage because high coverage with low mutation score reveals weak assertions",
      "Do configure mutant operators relevant to your domain because not all mutation types apply equally to every codebase",
      "Do run mutation tests incrementally on changed files because full runs are expensive and slow feedback loops"
    ],
    "dos_zh": [
      "从关键业务逻辑模块开始，因为对整个代码库进行变异测试可能慢得令人望而却步",
      "将变异分数作为覆盖率的补充，因为高覆盖率低变异分数揭示了弱断言",
      "配置与你领域相关的变异运算符，因为不是所有变异类型都同等适用于每个代码库",
      "对变更文件增量运行变异测试，因为完整运行代价高昂且减慢反馈循环"
    ],
    "donts": [
      "Don't chase 100% mutation score because equivalent mutants and diminishing returns make it impractical and wasteful",
      "Don't run mutation tests on generated code or boilerplate because mutations there rarely reveal meaningful test gaps",
      "Don't treat mutation testing as a replacement for code review because it tests mechanical correctness not design quality",
      "Don't ignore performance implications because mutation testing can multiply test execution time by 10x-100x"
    ],
    "donts_zh": [
      "不要追求100%变异分数，因为等价变异体和收益递减使其不切实际且浪费资源",
      "不要对生成的代码或样板代码运行变异测试，因为那里的变异很少揭示有意义的测试缺口",
      "不要将变异测试视为代码审查的替代品，因为它测试的是机械正确性而非设计质量",
      "不要忽视性能影响，因为变异测试可以将测试执行时间放大10到100倍"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix applied mutation testing to their critical payment processing and subscription management services. After achieving 90% line coverage, they discovered a mutation score of only 62%, revealing that many tests were exercising code without asserting meaningful outcomes. By systematically killing surviving mutants, they improved fault detection and reduced payment-related production incidents by 35% over two quarters.",
    "case_study_zh": "Netflix将变异测试应用于其关键的支付处理和订阅管理服务。在达到90%的行覆盖率后，他们发现变异分数仅为62%，揭示了许多测试在执行代码时没有断言有意义的结果。通过系统性地杀死存活的变异体，他们提高了故障检测能力，在两个季度内将支付相关的生产事故减少了35%。",
    "when_not_to_use": [
      "Rapid prototyping phases where code changes too fast for mutation analysis to provide actionable feedback",
      "Legacy codebases with minimal existing tests where writing basic tests should take priority over mutation analysis",
      "Performance-sensitive CI pipelines where mutation testing overhead would unacceptably delay feedback loops"
    ],
    "when_not_to_use_zh": [
      "快速原型阶段，代码变化太快，变异分析无法提供可操作的反馈",
      "现有测试极少的遗留代码库，编写基本测试应优先于变异分析",
      "对性能敏感的CI管道，变异测试开销会不可接受地延迟反馈循环"
    ],
    "adopters": [
      "Netflix",
      "Airbnb",
      "ING Bank",
      "Sky",
      "Info Support"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "Richard Lipton (1971). \"Fault Diagnosis of Computer Programs\". PhD Dissertation, Carnegie Mellon University.",
    "secondary_sources": [
      "Yue Jia and Mark Harman (2011). \"An Analysis and Survey of the Development of Mutation Testing\". IEEE Transactions on Software Engineering, 37(5).",
      "Henry Coles (2010). \"PIT Mutation Testing\". pitest.org."
    ],
    "typed_relations": [
      {
        "slug": "test-pyramid",
        "type": "complement"
      },
      {
        "slug": "tdd",
        "type": "complement"
      },
      {
        "slug": "property-based-testing",
        "type": "complement"
      }
    ]
  },
  {
    "id": 179,
    "name": "Snapshot Testing",
    "name_zh": "快照测试",
    "slug": "snapshot-testing",
    "category": "quality",
    "desc": "Capture output snapshots for regression detection by comparing current output against stored baselines",
    "desc_zh": "通过将当前输出与存储的基线进行比较，捕获输出快照以检测回归",
    "steps": [
      "Identify outputs suitable for snapshotting: rendered UI components, serialized data structures, API responses, or CLI outputs",
      "Generate initial snapshots: run the test suite to capture baseline outputs and commit the snapshot files to version control",
      "Write snapshot assertions in tests: compare current output against the stored snapshot and fail on any diff",
      "Review snapshot diffs carefully during code review: treat snapshot updates as production changes that require explicit approval",
      "Update snapshots intentionally when changes are expected: use the update flag deliberately, never blindly accept all snapshot changes"
    ],
    "steps_zh": [
      "识别适合快照的输出：渲染的UI组件、序列化数据结构、API响应或CLI输出",
      "生成初始快照：运行测试套件以捕获基线输出并将快照文件提交到版本控制",
      "在测试中编写快照断言：将当前输出与存储的快照进行比较，有差异则失败",
      "在代码审查中仔细审查快照差异：将快照更新视为需要明确批准的生产变更",
      "在预期变更时有意更新快照：刻意使用更新标志，永远不要盲目接受所有快照变更"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Render",
      "Snapshot",
      "Compare",
      "Update"
    ],
    "viz_labels_zh": [
      "渲染",
      "快照",
      "对比",
      "更新"
    ],
    "related": [
      "test-pyramid",
      "mutation-testing"
    ],
    "tags": [
      "testing",
      "snapshot-testing",
      "regression",
      "ui-testing",
      "baseline"
    ],
    "origin_author": "Jest team at Facebook, 2016",
    "origin_source": "Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation (Humble & Farley, 2010)",
    "origin_source_zh": "《持续交付：通过构建、测试和部署自动化可靠发布软件》（Humble & Farley，2010）",
    "complexity": "beginner",
    "when_to_use": [
      "Testing UI component rendering to catch unintended visual or structural changes across releases",
      "Validating serialized data formats like JSON or XML outputs where exact structure matters",
      "Detecting regressions in CLI tool output where users depend on stable formatting",
      "Guarding API response schemas against accidental changes that could break downstream consumers"
    ],
    "when_to_use_zh": [
      "测试UI组件渲染以捕获跨版本的意外视觉或结构变化",
      "验证序列化数据格式（如JSON或XML输出），其中精确结构很重要",
      "检测CLI工具输出中的回归，用户依赖稳定的格式",
      "防止API响应模式的意外变更，这些变更可能破坏下游消费者"
    ],
    "core_concepts": [
      "Snapshot File: A stored representation of expected output, committed to version control alongside test code",
      "Snapshot Diff: The comparison between current output and the stored baseline, highlighting any changes",
      "Inline Snapshot: A snapshot stored directly within the test file rather than in a separate snapshot file",
      "Snapshot Update: The deliberate act of regenerating baselines when intentional changes are made to output",
      "Snapshot Bloat: The accumulation of large or orphaned snapshot files that slow tests and obscure meaningful diffs"
    ],
    "core_concepts_zh": [
      "快照文件：预期输出的存储表示，与测试代码一起提交到版本控制",
      "快照差异：当前输出与存储基线之间的比较，突出显示任何变化",
      "内联快照：直接存储在测试文件中而非单独快照文件中的快照",
      "快照更新：当对输出进行有意更改时，刻意重新生成基线的行为",
      "快照膨胀：大型或孤立快照文件的累积，减慢测试速度并模糊有意义的差异"
    ],
    "timeline": [
      [
        "2010",
        "Humble and Farley advocate baseline comparison strategies for deployment validation in Continuous Delivery"
      ],
      [
        "2016",
        "Jest introduces built-in snapshot testing for React components, popularizing the technique widely"
      ],
      [
        "2018",
        "Snapshot testing extends beyond UI to API contracts, configuration files, and data serialization"
      ],
      [
        "2020",
        "Inline snapshots gain adoption, reducing file sprawl and improving test readability"
      ],
      [
        "2023",
        "Snapshot testing adapts to AI-generated content, capturing LLM output baselines for regression detection"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "Humble和Farley在《持续交付》中倡导基线比较策略用于部署验证"
      ],
      [
        "2016",
        "Jest为React组件引入内置快照测试，广泛普及了该技术"
      ],
      [
        "2018",
        "快照测试从UI扩展到API契约、配置文件和数据序列化"
      ],
      [
        "2020",
        "内联快照获得采用，减少文件蔓延并提高测试可读性"
      ],
      [
        "2023",
        "快照测试适应AI生成内容，捕获LLM输出基线以进行回归检测"
      ]
    ],
    "dos": [
      "Do review snapshot diffs with the same rigor as code changes because unreviewed snapshot updates hide regressions",
      "Do keep snapshots small and focused because large snapshots make diffs unreadable and reviews perfunctory",
      "Do use inline snapshots for small outputs because they keep the expected value co-located with the test logic",
      "Do delete orphaned snapshot files because stale snapshots waste CI time and confuse new team members"
    ],
    "dos_zh": [
      "以与代码变更相同的严格程度审查快照差异，因为未经审查的快照更新会隐藏回归",
      "保持快照小而集中，因为大型快照使差异不可读且审查流于形式",
      "对小型输出使用内联快照，因为它们将预期值与测试逻辑放在一起",
      "删除孤立的快照文件，因为过时的快照浪费CI时间并困扰新团队成员"
    ],
    "donts": [
      "Don't blindly update all snapshots when tests fail because this defeats the purpose of regression detection entirely",
      "Don't snapshot non-deterministic output like timestamps or random IDs because they cause false failures on every run",
      "Don't use snapshots as a substitute for specific assertions because snapshots test shape not behavior",
      "Don't let snapshot files grow unbounded because massive snapshots slow down test execution and version control operations"
    ],
    "donts_zh": [
      "不要在测试失败时盲目更新所有快照，因为这完全违背了回归检测的目的",
      "不要快照非确定性输出（如时间戳或随机ID），因为它们每次运行都会导致误报",
      "不要用快照替代具体断言，因为快照测试的是结构而非行为",
      "不要让快照文件无限增长，因为巨大的快照会拖慢测试执行和版本控制操作"
    ],
    "case_study_company": "Facebook",
    "case_study": "Facebook developed snapshot testing within Jest to manage the rapid pace of React component changes across thousands of engineers. Before snapshots, UI regressions frequently slipped through code review because reviewers could not visualize rendering changes from code diffs alone. After adopting snapshot testing, unintended UI changes became visible in pull request diffs, reducing UI regression reports by 40% and significantly improving code review quality for frontend changes.",
    "case_study_zh": "Facebook在Jest中开发了快照测试，以管理数千名工程师快速变化的React组件。在引入快照之前，UI回归经常在代码审查中遗漏，因为审查者无法仅从代码差异中可视化渲染变化。采用快照测试后，意外的UI变化在拉取请求差异中变得可见，将UI回归报告减少了40%，并显著提高了前端变更的代码审查质量。",
    "when_not_to_use": [
      "Highly dynamic outputs where the content changes frequently by design, making snapshots constantly stale",
      "Complex stateful interactions where behavior matters more than output shape and snapshot diffs are misleading",
      "Early-stage prototyping where UI and data formats are changing rapidly and snapshot maintenance creates drag"
    ],
    "when_not_to_use_zh": [
      "设计上内容频繁变化的高度动态输出，使快照持续过时",
      "复杂的有状态交互，行为比输出结构更重要，快照差异具有误导性",
      "UI和数据格式快速变化的早期原型阶段，快照维护造成拖累"
    ],
    "adopters": [
      "Facebook",
      "Airbnb",
      "Shopify",
      "Stripe",
      "Vercel"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "Facebook/Meta (2016). \"Jest Snapshot Testing\". jestjs.io.",
    "secondary_sources": [
      "Christoph Nakazawa (2016). \"Snapshot Testing in Jest\". jestjs.io/blog.",
      "Kent C. Dodds (2018). \"Effective Snapshot Testing\". kentcdodds.com."
    ],
    "typed_relations": [
      {
        "slug": "test-pyramid",
        "type": "complement"
      },
      {
        "slug": "mutation-testing",
        "type": "related"
      }
    ]
  },
  {
    "id": 180,
    "name": "Load Testing Patterns",
    "name_zh": "负载测试模式",
    "slug": "load-testing-patterns",
    "category": "quality",
    "desc": "Stress, spike, soak testing methodologies to validate system behavior under varying load conditions",
    "desc_zh": "压力测试、尖峰测试、浸泡测试方法论，验证系统在不同负载条件下的行为",
    "steps": [
      "Define performance baselines: establish normal traffic patterns, acceptable response times, throughput targets, and error rate thresholds",
      "Design load test scenarios: create stress tests (gradual ramp to breaking point), spike tests (sudden traffic surges), and soak tests (sustained load over hours)",
      "Build realistic test scripts: use production traffic patterns, representative data sets, and proper think times to avoid artificial load profiles",
      "Execute tests in a production-like environment: match infrastructure, data volume, and network topology to avoid misleading results",
      "Analyze results and establish regression gates: compare against baselines, identify bottlenecks, and integrate performance thresholds into CI/CD"
    ],
    "steps_zh": [
      "定义性能基线：建立正常流量模式、可接受的响应时间、吞吐量目标和错误率阈值",
      "设计负载测试场景：创建压力测试（逐步增加到断裂点）、尖峰测试（突然流量激增）和浸泡测试（持续数小时的负载）",
      "构建真实的测试脚本：使用生产流量模式、代表性数据集和适当的思考时间，避免人为的负载配置",
      "在类生产环境中执行测试：匹配基础设施、数据量和网络拓扑以避免误导性结果",
      "分析结果并建立回归门禁：与基线比较、识别瓶颈，并将性能阈值集成到CI/CD中"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Load Test",
      "Stress Test",
      "Spike Test",
      "Soak Test"
    ],
    "viz_labels_zh": [
      "负载测试",
      "压力测试",
      "峰值测试",
      "浸泡测试"
    ],
    "related": [
      "chaos-engineering",
      "sli-slo-sla",
      "circuit-breaker-pattern"
    ],
    "tags": [
      "testing",
      "load-testing",
      "performance",
      "stress-testing",
      "soak-testing"
    ],
    "origin_author": "Michael Nygard, 2007",
    "origin_source": "Release It! Design and Deploy Production-Ready Software (Nygard, 2018)",
    "origin_source_zh": "《发布！设计与部署生产就绪软件》（Nygard，2018）",
    "complexity": "intermediate",
    "when_to_use": [
      "Before major launches or marketing events that will significantly increase traffic volume",
      "After architectural changes like database migrations, service splits, or infrastructure moves",
      "When establishing SLAs or SLOs that require empirical evidence of system capacity",
      "During capacity planning to determine scaling limits and cost projections for growth"
    ],
    "when_to_use_zh": [
      "在将显著增加流量的重大发布或营销活动之前",
      "在架构变更（如数据库迁移、服务拆分或基础设施迁移）之后",
      "在建立需要系统容量经验证据的SLA或SLO时",
      "在容量规划期间确定扩展限制和增长成本预测时"
    ],
    "core_concepts": [
      "Stress Testing: Gradually increasing load beyond normal capacity to find the system's breaking point and failure modes",
      "Spike Testing: Applying sudden, extreme load increases to test auto-scaling, queuing, and graceful degradation under bursts",
      "Soak Testing: Running sustained load for extended periods to detect memory leaks, connection pool exhaustion, and resource degradation",
      "Baseline Profile: The established normal performance characteristics (latency, throughput, error rate) used as comparison benchmarks",
      "Think Time: Realistic pauses between user actions in test scripts that prevent artificial concurrency inflation"
    ],
    "core_concepts_zh": [
      "压力测试：逐步增加超出正常容量的负载，找到系统的断裂点和故障模式",
      "尖峰测试：施加突然的极端负载增加，测试自动扩展、排队和突发情况下的优雅降级",
      "浸泡测试：长时间运行持续负载以检测内存泄漏、连接池耗尽和资源退化",
      "基线配置：作为比较基准的已建立的正常性能特征（延迟、吞吐量、错误率）",
      "思考时间：测试脚本中用户操作之间的真实停顿，防止人为的并发膨胀"
    ],
    "timeline": [
      [
        "2007",
        "Nygard's Release It! establishes load testing as essential for production-ready software"
      ],
      [
        "2011",
        "Apache JMeter becomes the dominant open-source load testing tool for enterprise applications"
      ],
      [
        "2014",
        "Gatling introduces code-based load test scripting with detailed HTML reports and Scala DSL"
      ],
      [
        "2017",
        "k6 launches with a developer-friendly JavaScript API, making load tests feel like writing unit tests"
      ],
      [
        "2021",
        "Cloud-native load testing tools integrate with Kubernetes and serverless, enabling distributed test execution at scale"
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "Nygard的《发布！》将负载测试确立为生产就绪软件的必需"
      ],
      [
        "2011",
        "Apache JMeter成为企业应用最主要的开源负载测试工具"
      ],
      [
        "2014",
        "Gatling引入基于代码的负载测试脚本，提供详细的HTML报告和Scala DSL"
      ],
      [
        "2017",
        "k6以开发者友好的JavaScript API推出，使负载测试感觉像编写单元测试"
      ],
      [
        "2021",
        "云原生负载测试工具与Kubernetes和无服务器集成，实现大规模分布式测试执行"
      ]
    ],
    "dos": [
      "Do use production-like data volumes because testing against empty databases produces dangerously optimistic results",
      "Do include think times in test scripts because removing them creates unrealistic concurrency that masks true capacity",
      "Do test individual services in isolation and the full system together because bottlenecks shift depending on the scope",
      "Do run soak tests for at least 4-8 hours because memory leaks and connection pool issues often take hours to manifest"
    ],
    "dos_zh": [
      "使用类生产数据量，因为对空数据库的测试会产生危险的乐观结果",
      "在测试脚本中包含思考时间，因为移除它们会创建不真实的并发，掩盖真实容量",
      "分别对单个服务和完整系统进行测试，因为瓶颈会根据范围而转移",
      "浸泡测试至少运行4-8小时，因为内存泄漏和连接池问题通常需要数小时才能显现"
    ],
    "donts": [
      "Don't run load tests against production without circuit breakers because uncontrolled load can cause real outages",
      "Don't assume linear scaling because most systems hit non-linear bottlenecks at specific concurrency thresholds",
      "Don't test only happy paths because error handling under load often consumes more resources than normal operations",
      "Don't ignore client-side metrics because server response times alone miss network latency and rendering bottlenecks"
    ],
    "donts_zh": [
      "不要在没有断路器的情况下对生产运行负载测试，因为不受控的负载可能导致真实故障",
      "不要假设线性扩展，因为大多数系统在特定并发阈值处会遇到非线性瓶颈",
      "不要仅测试正常路径，因为负载下的错误处理通常比正常操作消耗更多资源",
      "不要忽视客户端指标，因为仅靠服务器响应时间会遗漏网络延迟和渲染瓶颈"
    ],
    "case_study_company": "Amazon",
    "case_study": "Amazon runs comprehensive load testing before every Prime Day using a combination of stress, spike, and soak tests across their entire microservice fleet. In preparation for Prime Day 2022, they simulated traffic 3x above projected peak, discovering that their recommendation service's connection pool configuration would exhaust under sustained spike load. Fixing this pre-production issue prevented an estimated $12M in lost revenue from degraded product recommendations during the actual event.",
    "case_study_zh": "Amazon在每次Prime Day之前对其整个微服务架构进行全面的负载测试，综合使用压力、尖峰和浸泡测试。在准备2022年Prime Day时，他们模拟了预计峰值3倍的流量，发现推荐服务的连接池配置在持续尖峰负载下会耗尽。在生产前修复此问题避免了实际活动期间因产品推荐降级而造成的估计1200万美元收入损失。",
    "when_not_to_use": [
      "Internal tools with a small, predictable user base where load is never a realistic concern",
      "Early MVPs where functional correctness is the primary risk and performance optimization is premature",
      "Batch processing systems with fixed workloads where throughput is inherently bounded and predictable"
    ],
    "when_not_to_use_zh": [
      "用户群小且可预测的内部工具，负载从来不是现实的关注点",
      "功能正确性是主要风险且性能优化为时过早的早期MVP",
      "具有固定工作负载的批处理系统，吞吐量本质上是有界且可预测的"
    ],
    "adopters": [
      "Amazon",
      "Google",
      "Netflix",
      "Uber",
      "Cloudflare"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Michael Nygard (2007). \"Release It! Design and Deploy Production-Ready Software\". Pragmatic Bookshelf.",
    "secondary_sources": [
      "Michael Nygard (2018). \"Release It! Design and Deploy Production-Ready Software, 2nd Edition\". Pragmatic Bookshelf.",
      "Scott Barber (2004). \"Web Load Testing for Dummies\". Wiley."
    ],
    "typed_relations": [
      {
        "slug": "chaos-engineering",
        "type": "complement"
      },
      {
        "slug": "sli-slo-sla",
        "type": "complement"
      },
      {
        "slug": "circuit-breaker-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 181,
    "name": "Error Handling Patterns",
    "name_zh": "错误处理模式",
    "slug": "error-handling-patterns",
    "category": "quality",
    "desc": "Fail-fast, retry, fallback, and dead letter queue patterns for resilient error management",
    "desc_zh": "快速失败、重试、回退和死信队列模式，用于弹性错误管理",
    "steps": [
      "Classify errors by recoverability: distinguish transient errors (network timeouts, rate limits) from permanent errors (invalid input, missing resources) to select the right handling strategy",
      "Implement fail-fast for unrecoverable errors: validate inputs early, throw immediately on invariant violations, and avoid wasting resources on doomed operations",
      "Add retry with exponential backoff for transient failures: use jitter to prevent thundering herds and set maximum retry limits to avoid infinite loops",
      "Design fallback strategies for degraded operation: serve cached data, return default values, or switch to backup services when primary paths fail",
      "Route persistently failing messages to dead letter queues: capture unprocessable messages for investigation without blocking the main processing pipeline"
    ],
    "steps_zh": [
      "按可恢复性分类错误：区分暂时性错误（网络超时、速率限制）和永久性错误（无效输入、缺失资源）以选择正确的处理策略",
      "对不可恢复错误实施快速失败：尽早验证输入，在违反不变量时立即抛出，避免在注定失败的操作上浪费资源",
      "为暂时性故障添加指数退避重试：使用抖动防止惊群效应，设置最大重试限制避免无限循环",
      "设计降级操作的回退策略：当主路径失败时提供缓存数据、返回默认值或切换到备用服务",
      "将持续失败的消息路由到死信队列：捕获无法处理的消息以供调查，同时不阻塞主处理管道"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Detect",
      "Handle",
      "Recover",
      "Log"
    ],
    "viz_labels_zh": [
      "检测",
      "处理",
      "恢复",
      "记录"
    ],
    "related": [
      "circuit-breaker-pattern",
      "bulkhead-pattern",
      "chaos-engineering"
    ],
    "tags": [
      "resilience",
      "error-handling",
      "retry",
      "fallback",
      "dead-letter-queue"
    ],
    "origin_author": "Michael Nygard, 2007",
    "origin_source": "Release It! Design and Deploy Production-Ready Software (Nygard, 2018)",
    "origin_source_zh": "《发布！设计与部署生产就绪软件》（Nygard，2018）",
    "complexity": "intermediate",
    "when_to_use": [
      "Building distributed systems where network failures, timeouts, and partial outages are inevitable",
      "Designing message processing pipelines where individual message failures should not halt the entire queue",
      "Creating user-facing APIs that must degrade gracefully rather than return raw errors to clients",
      "Operating services with strict SLA requirements where unhandled errors directly impact availability metrics"
    ],
    "when_to_use_zh": [
      "构建分布式系统，其中网络故障、超时和部分中断是不可避免的",
      "设计消息处理管道，其中单条消息失败不应停止整个队列",
      "创建面向用户的API，必须优雅降级而非向客户端返回原始错误",
      "运营有严格SLA要求的服务，未处理的错误直接影响可用性指标"
    ],
    "core_concepts": [
      "Fail-Fast: Immediately reporting an error when a precondition is not met, rather than proceeding with an operation doomed to fail later",
      "Retry with Backoff: Automatically reattempting a failed operation with increasing delays between attempts to allow transient issues to resolve",
      "Fallback: Providing an alternative response or behavior when the primary operation fails, maintaining partial functionality",
      "Dead Letter Queue: A separate queue where messages that cannot be processed after exhausting retries are stored for manual inspection and replay",
      "Idempotency: Designing operations so that retrying them produces the same result, preventing duplicate side effects from retry logic"
    ],
    "core_concepts_zh": [
      "快速失败：当前置条件不满足时立即报告错误，而不是继续注定在后续失败的操作",
      "退避重试：自动重试失败操作，尝试之间的延迟递增，以允许暂时性问题解决",
      "回退：当主操作失败时提供替代响应或行为，维持部分功能",
      "死信队列：一个单独的队列，在重试耗尽后无法处理的消息被存储在此以供手动检查和重放",
      "幂等性：设计操作使重试产生相同结果，防止重试逻辑产生重复副作用"
    ],
    "timeline": [
      [
        "2003",
        "Enterprise Integration Patterns formalizes dead letter channel as a messaging pattern for undeliverable messages"
      ],
      [
        "2007",
        "Nygard's Release It! establishes fail-fast and stability patterns as essential for production systems"
      ],
      [
        "2012",
        "Netflix open-sources Hystrix, popularizing retry, fallback, and circuit breaker patterns in microservices"
      ],
      [
        "2018",
        "Nygard's Release It! second edition expands error handling patterns with cloud-native and container-era practices"
      ],
      [
        "2022",
        "Resilience libraries like Polly (.NET), Resilience4j (Java), and cockatiel (Node) become standard dependencies in production services"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "《企业集成模式》将死信通道正式确立为不可送达消息的消息传递模式"
      ],
      [
        "2007",
        "Nygard的《发布！》将快速失败和稳定性模式确立为生产系统的必需"
      ],
      [
        "2012",
        "Netflix开源Hystrix，在微服务中普及重试、回退和断路器模式"
      ],
      [
        "2018",
        "Nygard的《发布！》第二版扩展了错误处理模式，加入云原生和容器时代的实践"
      ],
      [
        "2022",
        "Polly（.NET）、Resilience4j（Java）和cockatiel（Node）等弹性库成为生产服务的标准依赖"
      ]
    ],
    "dos": [
      "Do classify errors as transient or permanent at the point of occurrence because the correct handling strategy depends entirely on this distinction",
      "Do add jitter to retry delays because synchronized retries from multiple clients create thundering herd problems that worsen outages",
      "Do make retried operations idempotent because non-idempotent retries can cause duplicate charges, messages, or state mutations",
      "Do log every error with sufficient context because debugging production errors without request IDs, timestamps, and stack traces wastes hours"
    ],
    "dos_zh": [
      "在发生时将错误分类为暂时性或永久性，因为正确的处理策略完全取决于这一区分",
      "为重试延迟添加抖动，因为多个客户端的同步重试会产生惊群效应，加剧故障",
      "使被重试的操作具有幂等性，因为非幂等的重试可能导致重复收费、消息或状态变更",
      "用足够的上下文记录每个错误，因为没有请求ID、时间戳和堆栈跟踪的生产错误调试会浪费数小时"
    ],
    "donts": [
      "Don't catch and swallow exceptions silently because hidden errors accumulate into mysterious system degradation",
      "Don't retry permanent errors because retrying a 400 Bad Request or 404 Not Found wastes resources and delays actual error handling",
      "Don't use unbounded retries because infinite retry loops can amplify failures and consume all available resources",
      "Don't return raw internal errors to users because stack traces and internal messages expose security vulnerabilities and confuse end users"
    ],
    "donts_zh": [
      "不要静默捕获并吞掉异常，因为隐藏的错误会累积成神秘的系统退化",
      "不要重试永久性错误，因为重试400 Bad Request或404 Not Found会浪费资源并延迟实际的错误处理",
      "不要使用无限重试，因为无限重试循环会放大故障并消耗所有可用资源",
      "不要向用户返回原始内部错误，因为堆栈跟踪和内部消息会暴露安全漏洞并困扰终端用户"
    ],
    "case_study_company": "Uber",
    "case_study": "Uber implemented a layered error handling strategy across their ride-matching pipeline after a cascading failure caused a 15-minute outage during peak hours. They introduced fail-fast validation at API gateways, retry with exponential backoff and jitter for inter-service calls, cached fallbacks for pricing estimates, and dead letter queues for failed payment events. This architecture reduced cascading failures by 80% and ensured that payment events were never lost, even during partial outages.",
    "case_study_zh": "Uber在一次级联故障导致高峰时段15分钟中断后，在其出行匹配管道中实施了分层错误处理策略。他们在API网关引入快速失败验证，为服务间调用添加指数退避和抖动重试，为价格估算提供缓存回退，为失败的支付事件设置死信队列。这一架构将级联故障减少了80%，确保支付事件即使在部分中断期间也不会丢失。",
    "when_not_to_use": [
      "Simple synchronous scripts or CLI tools where errors should immediately terminate execution and print a message",
      "Stateless pure functions where errors are return values and traditional exception handling adds unnecessary complexity",
      "Systems in early prototype phase where resilience engineering is premature and slows iteration speed"
    ],
    "when_not_to_use_zh": [
      "简单的同步脚本或CLI工具，错误应立即终止执行并打印消息",
      "无状态纯函数，错误是返回值，传统异常处理增加不必要的复杂性",
      "早期原型阶段的系统，弹性工程为时过早且减慢迭代速度"
    ],
    "adopters": [
      "Uber",
      "Netflix",
      "Stripe",
      "Amazon",
      "LinkedIn"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Michael Nygard (2007). \"Release It! Design and Deploy Production-Ready Software\". Pragmatic Bookshelf.",
    "secondary_sources": [
      "Michael Nygard (2018). \"Release It! Design and Deploy Production-Ready Software, 2nd Edition\". Pragmatic Bookshelf.",
      "Robert C. Martin (2008). \"Clean Code: A Handbook of Agile Software Craftsmanship\". Prentice Hall. Chapter 7: Error Handling."
    ],
    "typed_relations": [
      {
        "slug": "circuit-breaker-pattern",
        "type": "complement"
      },
      {
        "slug": "bulkhead-pattern",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "related"
      }
    ]
  },
  {
    "id": 182,
    "name": "Observability-Driven Development",
    "name_zh": "可观测性驱动开发",
    "slug": "observability-driven-development",
    "category": "quality",
    "desc": "Design for observability from the start, not after — build systems that explain their own behavior",
    "desc_zh": "从一开始就为可观测性设计，而非事后补救——构建能解释自身行为的系统",
    "steps": [
      "Define observability requirements during design: identify what questions operators will need to answer and what signals will provide those answers",
      "Instrument code with structured logging, metrics, and distributed traces from day one, treating observability as a first-class feature",
      "Establish SLIs and SLOs early: define service level indicators and objectives that drive alerting, dashboards, and capacity decisions",
      "Build dashboards and runbooks alongside features: every new feature ships with the monitoring needed to operate it in production",
      "Practice observability in development: use the same tools locally that run in production so developers build intuition for system behavior"
    ],
    "steps_zh": [
      "在设计阶段定义可观测性需求：确定运维人员需要回答的问题以及提供这些答案的信号",
      "从第一天起用结构化日志、指标和分布式追踪检测代码，将可观测性视为一等特性",
      "尽早建立SLI和SLO：定义驱动告警、仪表盘和容量决策的服务水平指标和目标",
      "将仪表盘和运行手册与功能一起构建：每个新功能都附带在生产中运维所需的监控",
      "在开发中实践可观测性：在本地使用与生产中运行的相同工具，使开发者建立对系统行为的直觉"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Instrument",
      "Emit",
      "Query",
      "Act"
    ],
    "viz_labels_zh": [
      "埋点",
      "上报",
      "查询",
      "响应"
    ],
    "related": [
      "error-budget-policy",
      "chaos-engineering",
      "twelve-factor-app"
    ],
    "tags": [
      "observability",
      "monitoring",
      "logging",
      "tracing",
      "sre"
    ],
    "origin_author": "Charity Majors, 2018",
    "origin_source": "Release It! Design and Deploy Production-Ready Software (Nygard, 2018); Continuous Delivery (Humble & Farley, 2010)",
    "origin_source_zh": "《发布！设计与部署生产就绪软件》（Nygard，2018）；《持续交付》（Humble & Farley，2010）",
    "complexity": "advanced",
    "when_to_use": [
      "Starting a new service or system where observability can be designed in from the architecture phase",
      "Operating distributed microservices where request paths cross multiple services and debugging requires correlated traces",
      "After experiencing production incidents where lack of telemetry made root cause analysis slow or impossible",
      "Building platforms that other teams depend on, where SLOs and error budgets govern reliability contracts"
    ],
    "when_to_use_zh": [
      "启动新服务或系统，可以从架构阶段开始设计可观测性",
      "运营分布式微服务，请求路径跨越多个服务，调试需要关联追踪",
      "在经历生产事故后，缺乏遥测使根因分析缓慢或不可能",
      "构建其他团队依赖的平台，SLO和错误预算管理可靠性契约"
    ],
    "core_concepts": [
      "Three Pillars: Logs (discrete events), metrics (aggregated measurements), and traces (request-scoped causal chains) as complementary observability signals",
      "Structured Logging: Emitting log events as key-value pairs or JSON instead of unstructured text, enabling machine parsing and querying",
      "Distributed Tracing: Propagating trace IDs across service boundaries to reconstruct the full lifecycle of a request through the system",
      "SLI/SLO: Service Level Indicators measure what users experience; Service Level Objectives define acceptable thresholds for those indicators",
      "High Cardinality: The ability to query telemetry by any combination of dimensions (user ID, request ID, feature flag) to debug unique issues"
    ],
    "core_concepts_zh": [
      "三大支柱：日志（离散事件）、指标（聚合度量）和追踪（请求范围的因果链）作为互补的可观测性信号",
      "结构化日志：以键值对或JSON形式发出日志事件，而非非结构化文本，支持机器解析和查询",
      "分布式追踪：跨服务边界传播追踪ID，重建请求在系统中的完整生命周期",
      "SLI/SLO：服务水平指标衡量用户体验；服务水平目标为这些指标定义可接受的阈值",
      "高基数：能够按任意维度组合（用户ID、请求ID、特性标志）查询遥测数据以调试唯一问题"
    ],
    "timeline": [
      [
        "2010",
        "Humble and Farley emphasize monitoring and feedback loops as essential to continuous delivery pipelines"
      ],
      [
        "2015",
        "Google publishes the SRE book, codifying SLIs, SLOs, and error budgets as industry standards"
      ],
      [
        "2018",
        "Charity Majors coins 'observability-driven development' and advocates designing for debuggability from the start"
      ],
      [
        "2019",
        "OpenTelemetry launches as a unified standard for traces, metrics, and logs, merging OpenTracing and OpenCensus"
      ],
      [
        "2023",
        "Observability platforms integrate AI-powered anomaly detection, reducing alert fatigue and accelerating root cause analysis"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "Humble和Farley强调监控和反馈循环是持续交付管道的必需"
      ],
      [
        "2015",
        "Google发布SRE书籍，将SLI、SLO和错误预算编纂为行业标准"
      ],
      [
        "2018",
        "Charity Majors提出'可观测性驱动开发'，倡导从一开始就为可调试性设计"
      ],
      [
        "2019",
        "OpenTelemetry作为追踪、指标和日志的统一标准推出，合并了OpenTracing和OpenCensus"
      ],
      [
        "2023",
        "可观测性平台集成AI驱动的异常检测，减少告警疲劳并加速根因分析"
      ]
    ],
    "dos": [
      "Do instrument code during development because retrofitting observability into running systems is 10x more expensive and error-prone",
      "Do use structured logging everywhere because unstructured log messages cannot be efficiently queried, aggregated, or correlated",
      "Do propagate trace context across all service boundaries because broken trace chains make distributed debugging impossible",
      "Do define SLOs before launching because without explicit reliability targets there is no objective basis for operational decisions"
    ],
    "dos_zh": [
      "在开发期间就对代码进行埋点（instrumentation），因为为运行中的系统事后补加可观测性成本高10倍且容易出错",
      "在所有地方使用结构化日志，因为非结构化日志消息无法高效查询、聚合或关联",
      "在所有服务边界传播追踪上下文，因为断裂的追踪链使分布式调试不可能",
      "在启动前定义SLO，因为没有明确的可靠性目标就没有运维决策的客观基础"
    ],
    "donts": [
      "Don't add observability as an afterthought because post-hoc instrumentation misses critical paths and creates blind spots",
      "Don't alert on every metric because alert fatigue causes teams to ignore real incidents hidden among noise",
      "Don't collect telemetry without retention policies because unbounded storage costs grow exponentially and degrade query performance",
      "Don't treat logs, metrics, and traces as independent systems because correlated signals provide 10x more diagnostic value than isolated ones"
    ],
    "donts_zh": [
      "不要将可观测性作为事后想法添加，因为事后检测会遗漏关键路径并产生盲区",
      "不要对每个指标都设置告警，因为告警疲劳会导致团队忽略隐藏在噪音中的真实事故",
      "不要在没有保留策略的情况下收集遥测数据，因为无限的存储成本会指数增长并降低查询性能",
      "不要将日志、指标和追踪视为独立系统，因为关联的信号比孤立的信号提供10倍以上的诊断价值"
    ],
    "case_study_company": "Honeycomb",
    "case_study": "Honeycomb, founded by Charity Majors, practices observability-driven development as their core engineering philosophy. Every feature is designed with high-cardinality instrumentation from the start, enabling engineers to ask arbitrary questions about production behavior without pre-defining dashboards. When a customer reported intermittent slow queries, an engineer used trace-level analysis to identify that a specific combination of tenant size, query complexity, and time-of-day caused cache eviction storms — a root cause that traditional monitoring would have taken days to identify but was found in under 30 minutes with proper observability.",
    "case_study_zh": "由Charity Majors创立的Honeycomb将可观测性驱动开发作为其核心工程理念。每个功能从一开始就以高基数检测设计，使工程师能够在不预定义仪表盘的情况下对生产行为提出任意问题。当一位客户报告间歇性慢查询时，工程师使用追踪级分析确定了特定的租户大小、查询复杂度和时间组合导致了缓存驱逐风暴——这一根因使用传统监控需要数天才能识别，但通过适当的可观测性在30分钟内就被发现。",
    "when_not_to_use": [
      "Tiny single-process applications where simple logging and error output provide sufficient operational visibility",
      "Short-lived scripts or batch jobs where execution is deterministic and debugging relies on input-output validation",
      "Very early prototypes where operational concerns are irrelevant and shipping speed is the only priority"
    ],
    "when_not_to_use_zh": [
      "微小的单进程应用，简单的日志和错误输出提供足够的运维可见性",
      "短期脚本或批处理作业，执行是确定性的，调试依赖输入输出验证",
      "非常早期的原型，运维关注点不相关，交付速度是唯一优先级"
    ],
    "adopters": [
      "Honeycomb",
      "Google",
      "Slack",
      "GitHub",
      "Lightstep"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Charity Majors (2018). \"Observability: A Manifesto\". charity.wtf.",
    "secondary_sources": [
      "Charity Majors, Liz Fong-Jones, and George Miranda (2022). \"Observability Engineering\". O'Reilly Media.",
      "Cindy Sridharan (2018). \"Distributed Systems Observability\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "error-budget-policy",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "complement"
      },
      {
        "slug": "twelve-factor-app",
        "type": "complement"
      }
    ]
  },
  {
    "id": 269,
    "name": "Continuous Testing",
    "name_zh": "持续测试",
    "slug": "continuous-testing",
    "category": "quality",
    "desc": "Automated testing at every pipeline stage to provide continuous feedback on software quality throughout the delivery lifecycle",
    "desc_zh": "在每个流水线阶段自动化测试，在交付生命周期中持续提供软件质量反馈",
    "steps": [
      "Map tests to pipeline stages: assign unit tests to the commit stage, integration and contract tests to the integration stage, performance and security tests to the pre-production stage, and smoke tests to the production stage",
      "Instrument every pipeline stage with quality gates: define pass/fail thresholds (coverage minimum, error rate maximum, performance budget) that must be met before progressing to the next stage",
      "Shift tests left: run the fastest and cheapest tests as early as possible in the pipeline so developers receive feedback within minutes of committing, not hours",
      "Parallelize test execution: split test suites across concurrent workers and use test impact analysis to run only the tests affected by each code change, reducing pipeline duration",
      "Treat test failures as deployment blockers: configure the pipeline to fail fast on any test stage regression and enforce a team norm that no new work is started until the pipeline is green"
    ],
    "steps_zh": [
      "将测试映射到流水线阶段：将单元测试分配到提交阶段，将集成和契约测试分配到集成阶段，将性能和安全测试分配到预生产阶段，将冒烟测试分配到生产阶段",
      "在每个流水线阶段设置质量门：定义进入下一阶段前必须满足的通过/失败阈值（最低覆盖率、最大错误率、性能预算）",
      "将测试左移：尽早在流水线中运行最快、最廉价的测试，使开发者在提交后数分钟内而非数小时后收到反馈",
      "并行化测试执行：将测试套件分散到并发工作节点，并使用测试影响分析只运行受每次代码变更影响的测试，减少流水线持续时间",
      "将测试失败视为部署阻止器：将流水线配置为在任何测试阶段回归时快速失败，并强制团队规范：在流水线变为绿色之前不启动新工作"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Commit",
      "Build",
      "Test",
      "Report"
    ],
    "viz_labels_zh": [
      "提交",
      "构建",
      "测试",
      "报告"
    ],
    "related": [
      "test-pyramid",
      "tdd",
      "chaos-engineering",
      "load-testing-patterns"
    ],
    "tags": [
      "continuous-testing",
      "ci-cd",
      "quality-gates",
      "shift-left",
      "devops"
    ],
    "origin_author": "Wayne Ariola",
    "origin_source": "Ariola, W. & Cois, C. (2014). Continuous Testing for DevOps Professionals. Addison-Wesley; Humble, J. & Farley, D. (2010). Continuous Delivery. Addison-Wesley.",
    "origin_source_zh": "Ariola与Cois（2014）《DevOps专业人员的持续测试》Addison-Wesley；Humble与Farley（2010）《持续交付》Addison-Wesley",
    "complexity": "intermediate",
    "when_to_use": [
      "When a team is adopting continuous delivery or continuous deployment and needs quality signals at every pipeline stage rather than only at release time",
      "When deployment frequency is increasing and manual testing cycles can no longer keep pace with the release cadence",
      "When production incidents frequently originate from regressions that a faster feedback loop would have caught before deployment",
      "When multiple teams contribute to a shared codebase and you need automated quality gates to prevent integration failures from reaching downstream stages"
    ],
    "when_to_use_zh": [
      "当团队采用持续交付或持续部署，需要在每个流水线阶段而非仅在发布时获得质量信号时",
      "当部署频率增加，手动测试周期无法跟上发布节奏时",
      "当生产事故频繁源于更快反馈循环本可在部署前捕获的回归时",
      "当多个团队贡献同一代码库，需要自动化质量门防止集成故障到达下游阶段时"
    ],
    "core_concepts": [
      "Shift-left testing: moving test execution earlier in the pipeline so defects are detected and fixed when the cost of change is lowest — at commit time, not production time",
      "Quality gate: a defined threshold (coverage percentage, performance benchmark, security score) that the pipeline enforces as a binary pass/fail before stage promotion",
      "Test impact analysis: static or dynamic analysis that determines which tests are affected by a code change so only relevant tests run, reducing pipeline cycle time",
      "Environment parity: test environments at each pipeline stage must faithfully replicate production configuration, data volumes, and infrastructure to prevent environment-specific defects from escaping",
      "Continuous feedback loop: test results are published to developer dashboards and notification channels within minutes of commit so that the team can act on quality regressions immediately"
    ],
    "core_concepts_zh": [
      "测试左移：将测试执行移至流水线更早期，使缺陷在变更成本最低时被检测和修复——在提交时而非生产时",
      "质量门：流水线在阶段晋升前强制执行的定义阈值（覆盖率百分比、性能基准、安全评分），作为二元通过/失败判断",
      "测试影响分析：确定哪些测试受代码变更影响的静态或动态分析，只运行相关测试，减少流水线周期时间",
      "环境一致性：每个流水线阶段的测试环境必须忠实复制生产配置、数据量和基础设施，防止环境特定缺陷逃逸",
      "持续反馈循环：测试结果在提交后数分钟内发布到开发者仪表板和通知渠道，使团队能立即对质量回归采取行动"
    ],
    "timeline": [
      [
        "2006",
        "ThoughtWorks and Jez Humble begin codifying continuous delivery practices that place automated testing at every pipeline stage"
      ],
      [
        "2010",
        "Humble and Farley publish 'Continuous Delivery', establishing the deployment pipeline model with testing as a first-class concern at every stage"
      ],
      [
        "2014",
        "Wayne Ariola and Cois publish 'Continuous Testing for DevOps Professionals', coining the term and defining the practice as distinct from CI test automation"
      ],
      [
        "2019",
        "AI-assisted test generation tools emerge, enabling continuous testing to automatically expand test coverage as code changes without manual test authoring"
      ]
    ],
    "timeline_zh": [
      [
        "2006",
        "ThoughtWorks 和 Jez Humble 开始整理持续交付实践，将自动化测试置于每个流水线阶段"
      ],
      [
        "2010",
        "Humble 和 Farley 出版《持续交付》，建立了以测试为每个阶段一级关注点的部署流水线模型"
      ],
      [
        "2014",
        "Wayne Ariola 和 Cois 出版《DevOps专业人员的持续测试》，创造了该术语并将其定义为区别于 CI 测试自动化的独立实践"
      ],
      [
        "2019",
        "AI 辅助测试生成工具出现，使持续测试能随代码变更自动扩展测试覆盖率，无需手动编写测试"
      ]
    ],
    "dos": [
      "Do define quality gates with objective, measurable thresholds rather than subjective criteria so the pipeline can enforce them automatically without human judgment",
      "Do maintain a fast commit stage (under 10 minutes) by restricting it to unit tests and static analysis — move slower tests to later pipeline stages where they run in parallel",
      "Do track test flakiness as a first-class metric: quarantine flaky tests immediately because they erode trust in the entire pipeline and cause teams to ignore legitimate failures",
      "Do integrate security scanning (SAST, dependency audit) as a pipeline stage gate rather than a post-release activity so vulnerabilities are caught before they reach production"
    ],
    "dos_zh": [
      "用客观、可衡量的阈值而非主观标准定义质量门，使流水线能无需人工判断地自动执行",
      "通过将提交阶段限制为单元测试和静态分析来保持其快速执行（10 分钟以内）——将较慢的测试移至后续并行运行的流水线阶段",
      "将测试不稳定性作为一级指标追踪：立即隔离不稳定测试，因为它们侵蚀对整个流水线的信任，导致团队忽略真实的失败",
      "将安全扫描（SAST、依赖审计）集成为流水线阶段门而非发布后活动，使漏洞在到达生产前被捕获"
    ],
    "donts": [
      "Don't treat test failures as optional or allow the pipeline to continue with known failures — a broken pipeline that teams work around teaches engineers to ignore quality signals",
      "Don't run all tests at every stage — this maximizes feedback time and pipeline cost; stage-appropriate test selection is essential for sustainable continuous testing",
      "Don't skip environment parity for lower pipeline stages — tests that pass in an under-resourced environment and fail in production provide false confidence rather than real quality assurance",
      "Don't conflate continuous testing with continuous integration — CI ensures code integrates cleanly; continuous testing ensures quality is validated continuously at every pipeline stage"
    ],
    "donts_zh": [
      "不要将测试失败视为可选的或允许流水线在已知失败的情况下继续——让团队绕过损坏流水线会训练工程师忽略质量信号",
      "不要在每个阶段运行所有测试——这会最大化反馈时间和流水线成本；针对阶段的测试选择对可持续的持续测试至关重要",
      "不要为较低流水线阶段跳过环境一致性——在资源不足的环境中通过但在生产中失败的测试提供虚假信心而非真实质量保证",
      "不要将持续测试与持续集成混淆——CI 确保代码干净集成；持续测试确保质量在每个流水线阶段持续验证"
    ],
    "case_study_company": "Etsy",
    "case_study": "Etsy is widely cited as an early continuous delivery pioneer deploying to production over 50 times per day. Their continuous testing practice assigns different test suites to each pipeline stage: unit tests run on every commit and must complete within 8 minutes; integration tests covering critical buyer and seller flows run in parallel on dedicated workers; and a canary stage deploys the change to 1% of traffic with automated error rate monitoring acting as a production quality gate. When error rates exceed the baseline by more than 0.1%, the canary is automatically rolled back without human intervention. This architecture allowed Etsy to maintain sub-hour time-to-production for most changes while keeping incident rates lower than competitors deploying quarterly.",
    "case_study_zh": "Etsy 被广泛引用为早期持续交付先驱，每天向生产环境部署超过 50 次。他们的持续测试实践将不同的测试套件分配到每个流水线阶段：单元测试在每次提交时运行，必须在 8 分钟内完成；覆盖关键买家和卖家流程的集成测试在专用工作节点上并行运行；金丝雀阶段将变更部署到 1% 的流量，以自动错误率监控作为生产质量门。当错误率超过基线 0.1% 时，金丝雀会自动回滚而无需人工干预。这一架构使 Etsy 能为大多数变更保持不到一小时的生产到达时间，同时使事故率低于每季度部署的竞争对手。",
    "when_not_to_use": [
      "Very early prototyping phases where requirements are too unstable to justify maintaining a test suite that will be completely rewritten",
      "One-off scripts or data migration utilities with extremely short lifespans where the investment in pipeline infrastructure exceeds the risk reduction benefit",
      "Projects with extremely low release frequency (once a year) where the infrastructure overhead of continuous testing does not justify the cycle time reduction"
    ],
    "when_not_to_use_zh": [
      "需求过于不稳定、不值得维护将被完全重写的测试套件的极早期原型阶段",
      "寿命极短的一次性脚本或数据迁移工具，流水线基础设施投入超过风险降低收益",
      "发布频率极低（每年一次）的项目，持续测试的基础设施开销无法证明周期时间缩短的合理性"
    ],
    "adopters": [
      "Etsy",
      "Google",
      "Amazon",
      "Netflix",
      "Spotify"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "testability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Ariola, W. & Cois, C. (2014). \"Continuous Testing for DevOps Professionals\". Addison-Wesley Professional.",
    "secondary_sources": [
      "Humble, J. & Farley, D. (2010). \"Continuous Delivery\". Addison-Wesley Professional.",
      "Kim, G. et al. (2016). \"The DevOps Handbook\". IT Revolution Press.",
      "Forsgren, N., Humble, J. & Kim, G. (2018). \"Accelerate: The Science of Lean Software and DevOps\". IT Revolution Press."
    ],
    "typed_relations": [
      {
        "slug": "test-pyramid",
        "type": "complement"
      },
      {
        "slug": "tdd",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 270,
    "name": "Visual Regression Testing",
    "name_zh": "视觉回归测试",
    "slug": "visual-regression-testing",
    "category": "quality",
    "desc": "Screenshot comparison between baseline and current UI to catch unintended visual changes automatically",
    "desc_zh": "通过对比基准截图与当前 UI 截图自动捕获意外视觉变更",
    "steps": [
      "Capture baseline screenshots: render each component or page in a controlled environment (headless browser, Storybook) and store reference images that represent the approved visual state",
      "Run visual comparisons on every pull request: re-render the same components after code changes and use pixel-diff or perceptual hashing algorithms to detect changes above a configured tolerance threshold",
      "Review and triage visual diffs: present detected changes to developers as annotated diff images highlighting changed pixels; require explicit approval to update baselines for intentional changes",
      "Integrate into CI as a blocking check: configure the visual testing job as a required status check so PRs with unapproved visual changes cannot be merged until a designated reviewer approves the diff",
      "Maintain baseline hygiene: version-control baseline images alongside code, automate stale baseline cleanup, and re-capture baselines on dependency upgrades (browser versions, rendering libraries) that cause non-functional pixel shifts"
    ],
    "steps_zh": [
      "捕获基准截图：在受控环境（无头浏览器、Storybook）中渲染每个组件或页面，存储代表已批准视觉状态的参考图像",
      "在每个拉取请求上运行视觉比较：代码变更后重新渲染相同组件，使用像素差异或感知哈希算法检测超过配置容差阈值的变更",
      "审查和分类视觉差异：以带注释的差异图像（高亮变更像素）形式向开发者呈现检测到的变更；要求显式批准以更新有意变更的基准",
      "作为阻止性检查集成到 CI：将视觉测试作业配置为必需的状态检查，使带有未批准视觉变更的 PR 在指定审阅者批准差异前无法合并",
      "维护基准卫生：将基准图像与代码一起版本控制，自动清理过期基准，并在导致非功能性像素偏移的依赖升级（浏览器版本、渲染库）时重新捕获基准"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Baseline",
      "Capture",
      "Diff",
      "Approve"
    ],
    "viz_labels_zh": [
      "基准截图",
      "当前截图",
      "差异对比",
      "审核通过"
    ],
    "related": [
      "snapshot-testing",
      "test-pyramid",
      "continuous-testing",
      "bdd"
    ],
    "tags": [
      "visual-testing",
      "screenshot-comparison",
      "ui-regression",
      "pixel-diff",
      "storybook"
    ],
    "origin_author": "Percy.io",
    "origin_source": "Percy.io (2014, founded by Mike Fotinakis); Applitools Eyes visual AI testing platform; Perceptual Image Hash (pHash) algorithm by Zauner (2010).",
    "origin_source_zh": "Percy.io（2014年由Mike Fotinakis创立）；Applitools Eyes 视觉 AI 测试平台；Zauner（2010）感知图像哈希（pHash）算法",
    "complexity": "intermediate",
    "when_to_use": [
      "When shipping UI components that must remain visually consistent across browser updates, dependency changes, and CSS refactors",
      "When maintaining a design system or component library where visual regressions in shared components can affect hundreds of consumer screens simultaneously",
      "When your release process includes manual visual QA that you want to automate to reduce review time and human error",
      "When teams work across different operating systems and browsers that can subtly affect rendering and you need cross-environment baseline comparison"
    ],
    "when_to_use_zh": [
      "交付的 UI 组件必须在浏览器更新、依赖变更和 CSS 重构中保持视觉一致性时",
      "维护设计系统或组件库，共享组件的视觉回归可能同时影响数百个消费者页面时",
      "发布流程包含手动视觉 QA，你希望自动化以减少审查时间和人为错误时",
      "团队跨不同操作系统和浏览器工作，这些可能微妙地影响渲染，你需要跨环境基准比较时"
    ],
    "core_concepts": [
      "Pixel-level diffing: comparing rendered screenshots pixel-by-pixel and highlighting any differences; sensitive but prone to false positives from anti-aliasing, font hinting, and sub-pixel rendering variation",
      "Perceptual hashing: converting images to compact perceptual fingerprints (pHash, dHash) that are tolerant of minor rendering variations while still detecting structural visual changes",
      "Baseline management: the process of capturing, storing, reviewing, and updating reference screenshots that define the approved visual state of each component or page",
      "Component-level vs. page-level testing: testing individual components in isolation (via Storybook) provides more stable baselines than full-page screenshots, which are affected by dynamic content and state",
      "AI-powered visual matching: services like Applitools use machine learning to distinguish intentional design changes from unintentional regressions, reducing false positive rates in complex UIs"
    ],
    "core_concepts_zh": [
      "像素级差异比较：逐像素比较渲染截图并高亮任何差异；灵敏但容易因抗锯齿、字体微调和次像素渲染变化产生误报",
      "感知哈希：将图像转换为紧凑的感知指纹（pHash、dHash），对微小渲染变化容错，同时仍能检测结构性视觉变更",
      "基准管理：捕获、存储、审查和更新定义每个组件或页面已批准视觉状态的参考截图的过程",
      "组件级与页面级测试：在隔离环境（通过 Storybook）中测试单个组件比全页面截图提供更稳定的基准，后者受动态内容和状态影响",
      "AI 驱动的视觉匹配：Applitools 等服务使用机器学习区分有意的设计变更和无意的回归，降低复杂 UI 中的误报率"
    ],
    "timeline": [
      [
        "2012",
        "PhantomCSS emerges as one of the first open-source visual regression testing tools using PhantomJS for headless screenshot capture and comparison"
      ],
      [
        "2014",
        "Percy.io founded by Mike Fotinakis at Stripe, launching the first SaaS visual testing platform integrated with GitHub pull request workflows"
      ],
      [
        "2017",
        "Storybook's storyshots addon enables component-level visual testing without a full browser environment, dramatically reducing baseline stability issues"
      ],
      [
        "2020",
        "Playwright and Cypress add native screenshot comparison APIs, making visual regression testing accessible without third-party SaaS dependency"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "PhantomCSS 作为最早的开源视觉回归测试工具之一出现，使用 PhantomJS 进行无头截图捕获和比较"
      ],
      [
        "2014",
        "Percy.io 由 Mike Fotinakis 在 Stripe 创立，推出第一个与 GitHub 拉取请求工作流集成的 SaaS 视觉测试平台"
      ],
      [
        "2017",
        "Storybook 的 storyshots 插件实现无需完整浏览器环境的组件级视觉测试，大幅减少基准稳定性问题"
      ],
      [
        "2020",
        "Playwright 和 Cypress 添加原生截图比较 API，使视觉回归测试无需第三方 SaaS 依赖即可使用"
      ]
    ],
    "dos": [
      "Do test components in isolation via Storybook rather than full pages wherever possible — isolated components produce stable, deterministic baselines free of dynamic content noise",
      "Do configure a pixel-difference tolerance (typically 0.1-0.5%) to account for sub-pixel rendering differences across platforms rather than requiring pixel-perfect matches",
      "Do store baseline images in version control alongside the code they test so that baseline history is tied to code history and rollbacks are consistent",
      "Do run visual tests in a fixed, reproducible browser environment (pinned browser version, fixed viewport, deterministic fonts) to prevent environment drift from causing false positives"
    ],
    "dos_zh": [
      "尽可能通过 Storybook 在隔离环境中测试组件而非整页——隔离组件产生稳定、确定性的基准，不受动态内容噪声影响",
      "配置像素差异容差（通常 0.1-0.5%）以适应跨平台的次像素渲染差异，而非要求像素完美匹配",
      "将基准图像与其测试的代码一起存储在版本控制中，使基准历史与代码历史绑定，回滚保持一致",
      "在固定、可复现的浏览器环境（固定浏览器版本、固定视口、确定性字体）中运行视觉测试，防止环境漂移导致误报"
    ],
    "donts": [
      "Don't apply visual regression testing to every page without component isolation — full-page screenshots with dynamic data (dates, counts, user content) generate constant false positives",
      "Don't auto-approve baseline updates in CI without human review — silent auto-updates are how visual regressions slip undetected into the approved baseline",
      "Don't use visual regression as a substitute for functional testing — it catches layout and style changes but not behavioral regressions, missing logic bugs entirely",
      "Don't neglect cross-browser baseline maintenance — Chrome and Firefox render fonts and shadows differently, requiring separate baseline sets if cross-browser fidelity is a requirement"
    ],
    "donts_zh": [
      "不要在没有组件隔离的情况下将视觉回归测试应用于每个页面——包含动态数据（日期、计数、用户内容）的全页面截图会产生持续误报",
      "不要在 CI 中不经人工审查自动批准基准更新——静默自动更新是视觉回归悄然进入已批准基准的途径",
      "不要将视觉回归测试作为功能测试的替代——它捕获布局和样式变更而非行为回归，完全遗漏逻辑缺陷",
      "不要忽视跨浏览器基准维护——Chrome 和 Firefox 对字体和阴影的渲染不同，若跨浏览器保真度是需求，则需要独立的基准集"
    ],
    "case_study_company": "Shopify",
    "case_study": "Shopify's Polaris design system team uses visual regression testing extensively to protect the consistency of their shared component library across hundreds of Shopify products. They run Percy on every Storybook story for all 100+ Polaris components on each pull request. Before adopting visual testing, CSS refactors frequently introduced subtle regressions in border radii, spacing, or color contrast that passed code review but were only caught by QA engineers — sometimes after the regression had propagated to merchant-facing pages. After adopting Percy, the time to detect and fix visual regressions dropped from days to the same pull request cycle, and the design system team reduced their manual visual review effort by approximately 70%.",
    "case_study_zh": "Shopify 的 Polaris 设计系统团队广泛使用视觉回归测试，保护其共享组件库在数百个 Shopify 产品中的一致性。他们在每次拉取请求时对所有 100+ Polaris 组件的每个 Storybook story 运行 Percy。采用视觉测试前，CSS 重构频繁在边框半径、间距或颜色对比度上引入细微回归，这些通过代码审查但只被 QA 工程师发现——有时在回归已传播到商家可见页面后才发现。采用 Percy 后，检测和修复视觉回归的时间从数天缩短到同一拉取请求周期，设计系统团队减少了约 70% 的手动视觉审查工作量。",
    "when_not_to_use": [
      "Backend services, APIs, or data processing pipelines where there is no visual output to compare",
      "Highly dynamic UIs with real-time data, animations, or user-generated content that make stable baselines impossible without extensive mocking",
      "Very early-stage projects with rapidly changing designs where maintaining baselines costs more than the regressions they would catch"
    ],
    "when_not_to_use_zh": [
      "没有可比较视觉输出的后端服务、API 或数据处理流水线",
      "包含实时数据、动画或用户生成内容的高度动态 UI，在没有大量 Mock 的情况下稳定基准是不可能的",
      "设计快速变化的极早期项目，维护基准的成本超过它们能捕获的回归"
    ],
    "adopters": [
      "Shopify (Polaris)",
      "Atlassian",
      "GitHub",
      "Storybook community",
      "Stripe"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "testability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Percy.io Documentation. \"Visual Testing Handbook\". percy.io/docs.",
    "secondary_sources": [
      "Applitools (2020). \"Visual AI Testing Guide\". applitools.com/resources.",
      "Playwright Documentation. \"Screenshots\". playwright.dev/docs/screenshots.",
      "Cromwell, T. (2019). \"Visual Regression Testing in Practice\". CSS-Tricks."
    ],
    "typed_relations": [
      {
        "slug": "snapshot-testing",
        "type": "complement"
      },
      {
        "slug": "continuous-testing",
        "type": "complement"
      },
      {
        "slug": "test-pyramid",
        "type": "complement"
      }
    ]
  },
  {
    "id": 271,
    "name": "Pact Contract Testing",
    "name_zh": "Pact 契约测试",
    "slug": "pact-contract-testing",
    "category": "quality",
    "desc": "Consumer-driven contract verification between services ensuring API compatibility without end-to-end integration environments",
    "desc_zh": "消费者驱动的服务间契约验证，无需端到端集成环境即确保 API 兼容性",
    "steps": [
      "Write consumer-side interaction tests: using the Pact DSL, define the expected request the consumer will make and the minimum response it needs; run these against a Pact mock server to generate a pact file",
      "Publish the pact file to a Pact Broker: upload the generated pact (JSON file describing the consumer's expectations) to a shared Pact Broker so provider teams can discover and verify against it",
      "Verify the pact on the provider side: run the provider verification task that replays each interaction from the pact file against the real provider, asserting that the provider's actual response satisfies the consumer's expectations",
      "Integrate can-i-deploy into the pipeline: before deploying either consumer or provider, query the Pact Broker's can-i-deploy tool to check that the version being deployed is compatible with the versions already in each target environment",
      "Evolve the API safely: when the provider needs to change the API, verify the proposed change against every consumer pact stored in the Pact Broker, check whether any consumer contracts are broken, and coordinate the release order to ensure compatibility"
    ],
    "steps_zh": [
      "编写消费者侧交互测试：使用 Pact DSL 定义消费者将发出的预期请求和它需要的最小响应；针对 Pact Mock 服务器运行这些测试以生成 pact 文件",
      "将 pact 文件发布到 Pact Broker：将生成的 pact（描述消费者期望的 JSON 文件）提交到共享 Pact Broker，使提供者团队能够发现并针对其进行验证",
      "在提供者侧验证 pact：运行提供者验证任务，将 pact 文件中的每个交互重放到真实提供者，断言提供者的实际响应满足消费者的期望",
      "将 can-i-deploy 集成到流水线：在部署消费者或提供者之前，查询 Pact Broker 的 can-i-deploy 工具，检查要部署的版本是否与每个目标环境中已有的版本兼容",
      "安全地演进 API：当提供者需要更改 API 时，针对 Pact Broker 中存储的所有消费者 pact 验证拟议变更，检查是否有消费者契约被破坏，并协调发布顺序以确保兼容性"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Consumer",
      "Pact",
      "Provider",
      "Verify"
    ],
    "viz_labels_zh": [
      "消费方",
      "契约",
      "提供方",
      "验证"
    ],
    "related": [
      "testing-trophy",
      "continuous-testing",
      "service-discovery-pattern",
      "bdd"
    ],
    "tags": [
      "contract-testing",
      "consumer-driven",
      "microservices",
      "api-compatibility",
      "pact"
    ],
    "origin_author": "Ron Holshausen",
    "origin_source": "Holshausen, R. & Toomey, B. (2013). Pact — A Contract Testing Framework. github.com/pact-foundation/pact-ruby; Richardson, C. (2018). Microservices Patterns, Ch. 9. Manning.",
    "origin_source_zh": "Holshausen与Toomey（2013）Pact 契约测试框架 github.com/pact-foundation/pact-ruby；Richardson（2018）《微服务架构设计模式》第9章",
    "complexity": "intermediate",
    "when_to_use": [
      "When microservices are developed by independent teams and you need confidence that API changes in one service do not break other services without deploying a full integration environment",
      "When end-to-end integration tests are slow, expensive, or flaky and you want a faster feedback mechanism for API compatibility across service boundaries",
      "When you practice independent service deployment and need to verify deployment compatibility between consumer and provider versions before promoting to production",
      "When a provider serves multiple consumers with different API needs and you need visibility into which consumers depend on which parts of the API before making changes"
    ],
    "when_to_use_zh": [
      "微服务由独立团队开发，需要确信一个服务中的 API 变更不会在不部署完整集成环境的情况下破坏其他服务时",
      "端到端集成测试缓慢、昂贵或不稳定，需要更快的跨服务边界 API 兼容性反馈机制时",
      "实践独立服务部署，需要在晋升到生产前验证消费者和提供者版本之间的部署兼容性时",
      "提供者向具有不同 API 需求的多个消费者提供服务，在进行变更前需要了解哪些消费者依赖 API 的哪些部分时"
    ],
    "core_concepts": [
      "Consumer-driven contracts: the consumer defines the contract (what requests it makes and what minimal response it needs), not the provider — ensuring the API is designed for actual consumer needs rather than hypothetical capabilities",
      "Pact file: a JSON document generated by consumer tests recording the exact interactions (HTTP request/response pairs or message schemas) that the provider must verify",
      "Provider verification: the provider runs all interactions from each consumer's pact file against the real service, asserting that the provider's actual behavior satisfies all consumer contracts",
      "Pact Broker: a central store and visualisation tool for pact files that tracks which consumer version has been verified against which provider version, enabling safe independent deployments",
      "can-i-deploy: a CLI tool and API provided by the Pact Broker that answers whether a specific version of a service is safe to deploy to a given environment based on verified pact compatibility"
    ],
    "core_concepts_zh": [
      "消费者驱动的契约：消费者定义契约（它发出什么请求和需要什么最小响应），而非提供者——确保 API 为实际消费者需求而非假设能力设计",
      "Pact 文件：消费者测试生成的 JSON 文档，记录提供者必须验证的精确交互（HTTP 请求/响应对或消息模式）",
      "提供者验证：提供者针对真实服务运行来自每个消费者 pact 文件的所有交互，断言提供者的实际行为满足所有消费者契约",
      "Pact Broker：pact 文件的中央存储和可视化工具，追踪哪个消费者版本已针对哪个提供者版本验证，实现安全的独立部署",
      "can-i-deploy：Pact Broker 提供的 CLI 工具和 API，基于已验证的 pact 兼容性回答特定版本的服务是否可以安全部署到给定环境"
    ],
    "timeline": [
      [
        "2013",
        "Ron Holshausen and Beth Toomey build the first version of Pact at REA Group in Australia to solve cross-team API compatibility testing without shared integration environments"
      ],
      [
        "2015",
        "The Pact Foundation is formed and multi-language implementations (pact-jvm, pact-js, pact-python) are released, making consumer-driven contract testing available across the polyglot microservices ecosystem"
      ],
      [
        "2017",
        "Pact Broker open-source release and PactFlow (hosted Broker) launch enable enterprise-scale deployment pipeline integration with can-i-deploy CI gates"
      ],
      [
        "2021",
        "Bi-directional contract testing (BDCT) mode added to PactFlow, allowing OpenAPI specifications to serve as provider contracts without requiring provider-side Pact verification runs"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "Ron Holshausen 和 Beth Toomey 在澳大利亚 REA Group 构建第一版 Pact，解决无需共享集成环境的跨团队 API 兼容性测试"
      ],
      [
        "2015",
        "Pact Foundation 成立，多语言实现（pact-jvm、pact-js、pact-python）发布，使消费者驱动的契约测试在多语言微服务生态系统中可用"
      ],
      [
        "2017",
        "Pact Broker 开源发布和 PactFlow（托管 Broker）推出，通过 can-i-deploy CI 门实现企业级部署流水线集成"
      ],
      [
        "2021",
        "PactFlow 添加双向契约测试（BDCT）模式，允许 OpenAPI 规范作为提供者契约，无需运行提供者侧 Pact 验证"
      ]
    ],
    "dos": [
      "Do write consumer tests against only the fields the consumer actually uses, not the entire response schema — minimal contracts make providers free to add fields without breaking consumers",
      "Do run pact verification as part of the provider's CI pipeline so that a provider change that breaks a consumer contract is caught before the change is merged",
      "Do use can-i-deploy in your deployment pipeline as a gate — it prevents deploying a service version that is incompatible with the versions already running in the target environment",
      "Do tag pact versions with environment names (dev, staging, production) in the Pact Broker to accurately reflect what is deployed where and generate precise compatibility matrices"
    ],
    "dos_zh": [
      "针对消费者实际使用的字段而非整个响应模式编写消费者测试——最小契约使提供者可以自由添加字段而不破坏消费者",
      "将 pact 验证作为提供者 CI 流水线的一部分运行，使破坏消费者契约的提供者变更在合并前被捕获",
      "在部署流水线中使用 can-i-deploy 作为门控——它防止部署与目标环境中已运行版本不兼容的服务版本",
      "在 Pact Broker 中用环境名（dev、staging、production）标记 pact 版本，准确反映哪些内容部署在哪里并生成精确的兼容性矩阵"
    ],
    "donts": [
      "Don't write provider-driven contracts where the provider defines what consumers should expect — this defeats the purpose of consumer-driven testing and produces contracts that verify provider capabilities rather than consumer needs",
      "Don't use Pact for testing UI components or browser-to-API interactions — Pact is optimized for service-to-service HTTP and messaging contracts, not browser rendering behavior",
      "Don't skip the Pact Broker and share pact files via the file system or source control — without the Broker, you lose the can-i-deploy safety check and the compatibility matrix",
      "Don't treat Pact as a replacement for all integration testing — it verifies API shape and protocol compatibility but does not cover business logic correctness, performance, or authorization behavior"
    ],
    "donts_zh": [
      "不要编写提供者驱动的契约，让提供者定义消费者应该期望什么——这违背了消费者驱动测试的目的，产生验证提供者能力而非消费者需求的契约",
      "不要将 Pact 用于测试 UI 组件或浏览器到 API 的交互——Pact 为服务间 HTTP 和消息契约优化，不适用于浏览器渲染行为",
      "不要跳过 Pact Broker 通过文件系统或源码控制共享 pact 文件——没有 Broker，你就失去了 can-i-deploy 安全检查和兼容性矩阵",
      "不要将 Pact 视为所有集成测试的替代——它验证 API 形状和协议兼容性，但不覆盖业务逻辑正确性、性能或授权行为"
    ],
    "case_study_company": "REA Group",
    "case_study": "REA Group, Australia's leading property marketplace, was Pact's birthplace. In 2013, their engineering teams were struggling with a 50+ microservices architecture where integration tests ran against a shared staging environment that was perpetually broken by concurrent team deployments. Ron Holshausen and Beth Toomey built the first Pact library to replace the shared staging environment dependency for API compatibility verification. Within six months, REA Group reduced their integration test suite from a 2-hour run against the shared environment to 8-minute pact verification runs per service team. The can-i-deploy tool allowed teams to deploy independently on their own cadence with confidence, increasing their deployment frequency from weekly to multiple times per day per service.",
    "case_study_zh": "澳大利亚领先的房产交易平台 REA Group 是 Pact 的诞生地。2013 年，他们的工程团队在 50+ 微服务架构中挣扎，集成测试针对一个因并发团队部署而持续损坏的共享预发布环境运行。Ron Holshausen 和 Beth Toomey 构建了第一个 Pact 库，以替代 API 兼容性验证对共享预发布环境的依赖。六个月内，REA Group 将集成测试套件从针对共享环境的 2 小时运行缩短为每个服务团队 8 分钟的 pact 验证运行。can-i-deploy 工具使团队能够按照自己的节奏独立部署，部署频率从每周增加到每个服务每天多次。",
    "when_not_to_use": [
      "Monolithic applications where all services run in the same process and there are no service boundaries to test API contracts across",
      "Third-party external APIs where you cannot run provider verification against the real service and must rely on published API documentation or SDKs",
      "Simple single-consumer APIs where the consumer and provider are maintained by the same team and the overhead of Pact tooling exceeds the benefit of explicit contract verification"
    ],
    "when_not_to_use_zh": [
      "所有服务在同一进程中运行、没有跨服务边界测试 API 契约的单体应用",
      "无法针对真实服务运行提供者验证、必须依赖已发布 API 文档或 SDK 的第三方外部 API",
      "简单的单消费者 API，消费者和提供者由同一团队维护，Pact 工具开销超过显式契约验证的收益"
    ],
    "adopters": [
      "REA Group",
      "Atlassian",
      "IBM",
      "ING Bank",
      "Mastercard"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "testability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Pact Foundation. \"Pact Documentation\". docs.pact.io.",
    "secondary_sources": [
      "Richardson, C. (2018). \"Microservices Patterns\", Ch. 9. Manning Publications.",
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 9. O'Reilly Media.",
      "Holshausen, R. (2013). \"Pact: A Contract Testing Framework\". github.com/pact-foundation/pact-ruby."
    ],
    "typed_relations": [
      {
        "slug": "testing-trophy",
        "type": "complement"
      },
      {
        "slug": "continuous-testing",
        "type": "complement"
      },
      {
        "slug": "bdd",
        "type": "complement"
      }
    ]
  },
  {
    "id": 301,
    "name": "Fuzz Testing",
    "name_zh": "模糊测试",
    "slug": "fuzz-testing",
    "category": "quality",
    "desc": "Automated testing technique that feeds randomly generated, malformed, or unexpected inputs to a program to discover crashes, security vulnerabilities, and undefined behaviour.",
    "desc_zh": "向程序输入随机生成、畸形或意外数据的自动化测试技术，用于发现崩溃、安全漏洞和未定义行为。",
    "steps": [
      "Define the fuzzing target: identify the parsing, deserialization, or input-processing functions that consume untrusted external input and are most likely to contain memory safety bugs or crash paths",
      "Choose a fuzzing strategy and tool: coverage-guided fuzzers (AFL++, libFuzzer, Jazzer) are most effective for complex input formats; generation-based fuzzers (Hypothesis, Atheris) are better for structured protocols and API fuzzing",
      "Write a fuzz harness: a small entry-point function that accepts raw bytes from the fuzzer engine and feeds them to the target function; the harness must not crash on arbitrary input — crashes indicate bugs, not expected harness behaviour",
      "Run the fuzzer with an initial corpus of valid example inputs to seed coverage; let it run for an extended period (hours to days) to allow the feedback-driven mutation engine to explore deep code paths",
      "Triage discovered crashes: reproduce each crash with the minimised input, identify the root cause (buffer overflow, use-after-free, integer overflow, panic), file a bug with the crash input attached, and add the crashing input to the regression corpus"
    ],
    "steps_zh": [
      "定义模糊测试目标：识别消耗不可信外部输入的解析、反序列化或输入处理函数，这些函数最有可能包含内存安全漏洞或崩溃路径",
      "选择模糊测试策略和工具：覆盖率引导的模糊器（AFL++、libFuzzer、Jazzer）对复杂输入格式最有效；基于生成的模糊器（Hypothesis、Atheris）更适合结构化协议和 API 模糊测试",
      "编写模糊测试驱动：一个小型入口函数，接受来自模糊器引擎的原始字节并将其输入目标函数；驱动不能因任意输入而崩溃——崩溃表示存在 bug，而非预期的驱动行为",
      "使用有效示例输入的初始语料库运行模糊器以播种覆盖率；让其运行较长时间（数小时至数天），允许反馈驱动的变异引擎探索深层代码路径",
      "对发现的崩溃进行分类：用最小化输入复现每个崩溃，识别根本原因（缓冲区溢出、释放后使用、整数溢出、panic），提交附带崩溃输入的 bug，并将崩溃输入添加到回归语料库"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Seed",
      "Mutate",
      "Execute",
      "Triage"
    ],
    "viz_labels_zh": [
      "种子输入",
      "变异生成",
      "执行",
      "问题分类"
    ],
    "related": [
      "property-based-testing",
      "continuous-testing",
      "chaos-engineering"
    ],
    "tags": [
      "fuzzing",
      "security-testing",
      "afl",
      "libfuzzer",
      "random-testing",
      "vulnerability-discovery"
    ],
    "origin_author": "Barton Miller",
    "origin_source": "Miller, B., Fredriksen, L., & So, B. (1990). \"An Empirical Study of the Reliability of UNIX Utilities\". Communications of the ACM, 33(12), 32-44.",
    "origin_source_zh": "Miller, B., Fredriksen, L., & So, B.（1990）.「An Empirical Study of the Reliability of UNIX Utilities」. Communications of the ACM, 33(12), 32-44.",
    "complexity": "advanced",
    "when_to_use": [
      "Libraries and services that parse untrusted external data — file formats, network protocols, serialization formats, APIs — where unexpected input can trigger memory corruption, crashes, or security vulnerabilities",
      "Security-critical codebases in C, C++, or Rust where memory safety bugs (buffer overflows, use-after-free, integer overflows) carry high severity and traditional unit testing cannot exhaustively cover the input space",
      "Before releasing a new parser, codec, or protocol implementation to production where the cost of a post-release CVE or crash significantly exceeds the investment in pre-release fuzzing",
      "Continuous integration pipelines for open-source projects and critical infrastructure where OSS-Fuzz integration or ClusterFuzz can run fuzzing at scale between releases"
    ],
    "when_to_use_zh": [
      "解析不可信外部数据的库和服务——文件格式、网络协议、序列化格式、API——其中意外输入可能触发内存损坏、崩溃或安全漏洞",
      "C、C++ 或 Rust 中安全关键代码库，内存安全漏洞（缓冲区溢出、释放后使用、整数溢出）严重性高，传统单元测试无法穷举输入空间",
      "在将新解析器、编解码器或协议实现发布到生产环境之前，发布后 CVE 或崩溃的成本显著超过发布前模糊测试的投入",
      "开源项目和关键基础设施的持续集成流水线，OSS-Fuzz 集成或 ClusterFuzz 可以在发布间隔期间大规模运行模糊测试"
    ],
    "core_concepts": [
      "Coverage-Guided Fuzzing: the fuzzer instruments the binary to measure which code paths each input exercises; mutations that increase code coverage are retained in the corpus, enabling the fuzzer to progressively discover deeper and rarer code paths through feedback-driven mutation",
      "Fuzz Harness: a thin wrapper function that translates raw fuzzer-provided bytes into a structured call to the target function; the quality of the harness determines the fuzzing depth — a poor harness limits the fuzzer to shallow input parsing",
      "Corpus and Seed Inputs: a corpus of valid example inputs bootstraps coverage by giving the fuzzer a starting point that already exercises non-trivial code paths; seed quality directly affects time-to-first-crash on complex parsers",
      "Sanitizers: fuzz testing is most effective when combined with AddressSanitizer (ASan), UndefinedBehaviourSanitizer (UBSan), and MemorySanitizer — these instrument the binary to detect memory safety violations that do not cause immediate crashes, dramatically increasing bug discovery rate"
    ],
    "core_concepts_zh": [
      "覆盖率引导的模糊测试：模糊器对二进制文件进行插桩以测量每个输入执行了哪些代码路径；增加代码覆盖率的变异被保留在语料库中，使模糊器能够通过反馈驱动的变异逐步发现更深更罕见的代码路径",
      "模糊测试驱动：将模糊器提供的原始字节转换为对目标函数的结构化调用的薄包装函数；驱动的质量决定了模糊测试的深度——差的驱动将模糊器限制在浅层输入解析",
      "语料库和种子输入：有效示例输入的语料库通过给模糊器一个已经执行非平凡代码路径的起点来引导覆盖率；种子质量直接影响复杂解析器的首次崩溃时间",
      "消毒剂：当与 AddressSanitizer（ASan）、UndefinedBehaviourSanitizer（UBSan）和 MemorySanitizer 结合使用时，模糊测试最为有效——这些工具对二进制文件进行插桩以检测不会立即导致崩溃的内存安全违规，大幅提高 bug 发现率"
    ],
    "timeline": [
      [
        "1990",
        "Barton Miller at University of Wisconsin publishes the first fuzz testing study, randomly piping characters to UNIX utilities and finding that 25-33% crashed or hung"
      ],
      [
        "2004",
        "Michael Zalewski releases American Fuzzy Lop (AFL), a coverage-guided fuzzer that became the dominant tool for security fuzzing by dramatically outperforming random input generation"
      ],
      [
        "2016",
        "Google launches OSS-Fuzz, providing continuous free fuzzing infrastructure for critical open-source projects; libFuzzer is integrated into LLVM as the standard in-process fuzzer"
      ],
      [
        "2022",
        "Go 1.18 ships built-in fuzz testing support in the standard library; Rust cargo-fuzz and Python Atheris bring coverage-guided fuzzing to all major language ecosystems"
      ]
    ],
    "timeline_zh": [
      [
        "1990",
        "威斯康星大学的 Barton Miller 发表了首个模糊测试研究，向 UNIX 工具随机输入字符，发现 25-33% 的工具会崩溃或挂起"
      ],
      [
        "2004",
        "Michael Zalewski 发布 American Fuzzy Lop（AFL），一个覆盖率引导的模糊器，通过显著优于随机输入生成而成为安全模糊测试的主流工具"
      ],
      [
        "2016",
        "Google 启动 OSS-Fuzz，为关键开源项目提供持续免费的模糊测试基础设施；libFuzzer 作为标准进程内模糊器集成到 LLVM"
      ],
      [
        "2022",
        "Go 1.18 在标准库中内置了模糊测试支持；Rust cargo-fuzz 和 Python Atheris 将覆盖率引导的模糊测试带到所有主要语言生态系统"
      ]
    ],
    "dos": [
      "Do always run fuzzers with memory safety sanitizers enabled (AddressSanitizer, UndefinedBehaviourSanitizer) — without sanitizers, many memory corruption bugs cause no visible crash and go undetected",
      "Do minimise crashing inputs using the fuzzer's corpus minimisation tool before filing bugs — a 3-byte reproducer is far more useful for debugging than a 50KB input that triggers the same crash",
      "Do maintain a seed corpus of valid, structurally diverse example inputs and commit it to source control — seed quality has the largest single impact on time-to-discover-first-unique-bug on complex parsers",
      "Do integrate fuzzing into CI with a time-limited run (5-10 minutes) to catch regressions, supplemented by long overnight or continuous cloud-based fuzzing campaigns for deeper exploration"
    ],
    "dos_zh": [
      "始终在启用内存安全消毒剂（AddressSanitizer、UndefinedBehaviourSanitizer）的情况下运行模糊器——没有消毒剂，许多内存损坏 bug 不会导致可见崩溃而被漏检",
      "在提交 bug 之前使用模糊器的语料库最小化工具最小化崩溃输入——3 字节的复现器比触发相同崩溃的 50KB 输入对调试有用得多",
      "维护有效的、结构多样的示例输入种子语料库并提交到源码控制——种子质量对复杂解析器首次发现唯一 bug 的时间影响最大",
      "将模糊测试集成到 CI 中，进行时间限制的运行（5-10 分钟）以捕获回归，并辅以长时间的夜间或持续基于云的模糊测试活动进行更深入的探索"
    ],
    "donts": [
      "Do not fuzz without a harness that properly initialises all global and thread-local state — uninitialised state causes false positive crashes that mislead triage and waste debugging time",
      "Do not treat a crash as confirmed until you have reproduced it deterministically with the minimised input on a clean build — transient crashes from race conditions or heap ASLR interference require special handling",
      "Do not limit fuzzing to only the happy-path entry points — the most security-critical bugs are found in error handling, boundary conditions, and rarely-exercised code paths that only receive malformed inputs",
      "Do not discard crashing inputs after fixing the bug — add them to the regression corpus so future fuzzing campaigns continue from the known boundary rather than rediscovering the same fixed bug"
    ],
    "donts_zh": [
      "不要在没有正确初始化所有全局和线程本地状态的驱动情况下进行模糊测试——未初始化的状态会导致假阳性崩溃，误导分类并浪费调试时间",
      "在没有使用最小化输入在干净构建上确定性复现崩溃之前，不要将崩溃视为已确认——来自竞争条件或堆 ASLR 干扰的瞬态崩溃需要特殊处理",
      "不要将模糊测试限制在仅正常路径的入口点——最安全关键的 bug 在错误处理、边界条件和只接收畸形输入的罕见代码路径中被发现",
      "在修复 bug 后不要丢弃崩溃输入——将其添加到回归语料库，使未来的模糊测试活动从已知边界继续，而不是重新发现相同的已修复 bug"
    ],
    "case_study_company": "Google",
    "case_study": "Google launched OSS-Fuzz in 2016 to provide continuous free fuzzing infrastructure for critical open-source projects integrated into the broader software supply chain. By 2024, OSS-Fuzz had fuzzed over 1,000 open-source projects and found over 10,000 vulnerabilities and bugs, including critical CVEs in OpenSSL, libpng, FFmpeg, FreeType, and dozens of other widely deployed libraries. Google's internal fuzzing infrastructure, ClusterFuzz, runs tens of billions of test cases per day across Chrome and Android. In Chrome specifically, fuzzing has been credited with finding approximately 25% of all security bugs before they reach users. The Chrome security team maintains over 1,500 fuzz targets covering parser, codec, and IPC code, and all new Chrome security-sensitive code is required to have an associated fuzz harness as a condition of code review approval.",
    "case_study_zh": "Google 于 2016 年启动 OSS-Fuzz，为集成到更广泛软件供应链的关键开源项目提供持续免费的模糊测试基础设施。到 2024 年，OSS-Fuzz 已对 1000 多个开源项目进行了模糊测试，发现了 10000 多个漏洞和 bug，包括 OpenSSL、libpng、FFmpeg、FreeType 和数十个其他广泛部署库中的关键 CVE。Google 的内部模糊测试基础设施 ClusterFuzz 每天在 Chrome 和 Android 上运行数百亿个测试用例。在 Chrome 中，模糊测试被认为在漏洞到达用户之前发现了大约 25% 的所有安全 bug。Chrome 安全团队维护了 1500 多个覆盖解析器、编解码器和 IPC 代码的模糊目标，所有新的 Chrome 安全敏感代码都需要有相关的模糊驱动作为代码审查批准的条件。",
    "when_not_to_use": [
      "Pure business logic and domain rule validation where all inputs are structured, pre-validated, and the attack surface consists of logical correctness rather than parsing or memory safety",
      "Systems written in memory-safe languages (Python, JavaScript, Java, Go with GC) where the primary vulnerability class being targeted is not applicable and fuzzing discovers logic bugs more slowly than directed testing",
      "UI and end-to-end workflows where the state space is too large and input space too structured for random mutation to exercise meaningful code paths — BDD and scenario-based testing are more appropriate",
      "Early-stage prototypes where the code structure changes frequently enough that maintaining fuzz harnesses and seed corpora represents overhead disproportionate to code stability"
    ],
    "when_not_to_use_zh": [
      "纯业务逻辑和领域规则验证，其中所有输入都是结构化、预验证的，攻击面由逻辑正确性而非解析或内存安全构成",
      "使用内存安全语言（Python、JavaScript、带 GC 的 Java、Go）编写的系统，其中针对的主要漏洞类别不适用，模糊测试发现逻辑 bug 的速度比定向测试慢",
      "状态空间太大且输入空间太结构化，随机变异无法执行有意义代码路径的 UI 和端到端工作流——BDD 和基于场景的测试更为合适",
      "代码结构变化频繁，维护模糊驱动和种子语料库的开销与代码稳定性不成比例的早期原型"
    ],
    "adopters": [
      "Google (OSS-Fuzz, ClusterFuzz)",
      "Microsoft (Security Risk Detection)",
      "Apple (libFuzzer in Xcode)",
      "Mozilla (Firefox fuzzing)",
      "OpenSSF",
      "LLVM/Clang project"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Miller, B., Fredriksen, L., & So, B. (1990). \"An Empirical Study of the Reliability of UNIX Utilities\". Communications of the ACM, 33(12), 32-44.",
    "secondary_sources": [
      "Zalewski, M. (2015). \"American Fuzzy Lop Technical Details\". lcamtuf.coredump.cx.",
      "Serebryany, K. (2017). \"libFuzzer: A Library for Coverage-Guided Fuzz Testing\". llvm.org.",
      "Google (2016). \"OSS-Fuzz: Continuous Fuzzing for Open Source Software\". github.com/google/oss-fuzz."
    ],
    "typed_relations": [
      {
        "slug": "continuous-testing",
        "type": "complement"
      },
      {
        "slug": "property-based-testing",
        "type": "related"
      },
      {
        "slug": "chaos-engineering",
        "type": "related"
      }
    ]
  },
  {
    "id": 302,
    "name": "Accessibility Testing (WCAG)",
    "name_zh": "无障碍测试（WCAG）",
    "slug": "accessibility-testing-wcag",
    "category": "quality",
    "desc": "Systematic testing approach combining automated scanning and manual evaluation to verify that digital products comply with WCAG accessibility guidelines and are usable by people with disabilities.",
    "desc_zh": "结合自动化扫描和手动评估的系统性测试方法，用于验证数字产品符合 WCAG 无障碍指南且对残障人士可用。",
    "steps": [
      "Run automated accessibility scanning tools (axe-core, Lighthouse, WAVE) during development and in CI to detect WCAG failures that are automatable — contrast ratios, missing ARIA labels, keyboard trap detection, and image alt text",
      "Supplement automated scanning with manual keyboard navigation testing: verify that all interactive elements are reachable by Tab/Shift-Tab, that focus order is logical, and that no keyboard traps exist",
      "Test with screen readers (NVDA and JAWS on Windows, VoiceOver on macOS and iOS, TalkBack on Android) to verify that page structure, headings, forms, and dynamic content updates are announced correctly",
      "Conduct user testing with participants who have disabilities including visual, motor, cognitive, and hearing impairments — automated tools detect only 30-40% of WCAG failures; user testing discovers the remainder",
      "Document findings against specific WCAG 2.1 success criteria, prioritise by severity (blocker versus advisory), fix and retest, and integrate automated checks as a permanent CI gate to prevent regressions"
    ],
    "steps_zh": [
      "在开发期间和 CI 中运行自动化无障碍扫描工具（axe-core、Lighthouse、WAVE）以检测可自动化的 WCAG 失败——对比度比率、缺失 ARIA 标签、键盘陷阱检测和图像替代文本",
      "用手动键盘导航测试补充自动化扫描：验证所有交互元素可通过 Tab/Shift-Tab 到达，焦点顺序合理，且不存在键盘陷阱",
      "使用屏幕阅读器（Windows 上的 NVDA 和 JAWS、macOS 和 iOS 上的 VoiceOver、Android 上的 TalkBack）测试，验证页面结构、标题、表单和动态内容更新是否正确播报",
      "与包括视觉、运动、认知和听觉障碍参与者进行用户测试——自动化工具仅能检测 30-40% 的 WCAG 失败；用户测试发现其余问题",
      "对照特定的 WCAG 2.1 成功标准记录发现，按严重性（阻塞性与建议性）优先处理，修复并重新测试，并将自动化检查作为永久 CI 关卡以防止回退"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "A",
      "AA",
      "AAA"
    ],
    "viz_labels_zh": [
      "A级",
      "AA级",
      "AAA级"
    ],
    "related": [
      "continuous-testing",
      "testing-trophy",
      "bdd"
    ],
    "tags": [
      "accessibility",
      "wcag",
      "a11y",
      "screen-reader",
      "inclusive-design",
      "compliance"
    ],
    "origin_author": "W3C WAI",
    "origin_source": "W3C Web Accessibility Initiative (1999). \"Web Content Accessibility Guidelines 1.0\". w3.org/TR/WCAG10/. Current version: WCAG 2.2 (2023). w3.org/TR/WCAG22/.",
    "origin_source_zh": "W3C Web 无障碍倡议（1999）.「Web 内容无障碍指南 1.0」. w3.org/TR/WCAG10/. 当前版本：WCAG 2.2（2023）. w3.org/TR/WCAG22/.",
    "complexity": "intermediate",
    "when_to_use": [
      "All public-facing digital products and services, especially those subject to accessibility legislation (ADA in the US, EN 301 549 in the EU, AODA in Canada) where WCAG compliance is a legal requirement",
      "Products serving users in sectors with higher-than-average rates of disability — government services, healthcare, education, financial services — where accessibility failures directly exclude vulnerable populations",
      "Design system and component library development where embedding accessibility testing early prevents downstream WCAG regressions across all consuming applications",
      "Continuous delivery pipelines where automated axe-core scanning in CI provides ongoing regression protection between manual audit cycles"
    ],
    "when_to_use_zh": [
      "所有面向公众的数字产品和服务，特别是受无障碍立法约束的产品（美国 ADA、欧盟 EN 301 549、加拿大 AODA），其中 WCAG 合规性是法律要求",
      "服务于残障率高于平均水平行业用户的产品——政府服务、医疗保健、教育、金融服务——无障碍失败直接排除弱势群体",
      "设计系统和组件库开发，在早期嵌入无障碍测试可防止所有消费方应用出现下游 WCAG 回退",
      "持续交付流水线，CI 中的自动化 axe-core 扫描在手动审计周期之间提供持续的回退保护"
    ],
    "core_concepts": [
      "POUR Principles: WCAG 2.x is organised around four principles — Perceivable (information must be presentable in ways users can perceive), Operable (UI components must be navigable), Understandable (content and operation must be comprehensible), Robust (content must be parsable by assistive technologies)",
      "Conformance Levels: WCAG defines three levels — Level A (minimum, most critical failures), Level AA (standard compliance target mandated by most regulations), Level AAA (enhanced, not required for full site conformance) — most legal requirements reference AA",
      "Automated vs Manual Coverage: automated tools reliably detect approximately 30-40% of WCAG failures; the remaining 60-70% require manual testing, screen reader verification, and user testing because they involve subjective usability judgment that tools cannot make",
      "Accessible Name and Description: the accessible name is what assistive technology announces for an interactive element — computed from label elements, aria-label, aria-labelledby, or button text; missing or incorrect accessible names are the single most common WCAG failure category"
    ],
    "core_concepts_zh": [
      "POUR 原则：WCAG 2.x 围绕四个原则组织——可感知（信息必须以用户可感知的方式呈现）、可操作（UI 组件必须可导航）、可理解（内容和操作必须可理解）、健壮性（内容必须可被辅助技术解析）",
      "合规级别：WCAG 定义了三个级别——A 级（最低要求，最关键的失败）、AA 级（大多数法规要求的标准合规目标）、AAA 级（增强级，不要求整个网站合规）——大多数法律要求参考 AA 级",
      "自动化与手动覆盖：自动化工具可靠地检测约 30-40% 的 WCAG 失败；其余 60-70% 需要手动测试、屏幕阅读器验证和用户测试，因为它们涉及工具无法做出的主观可用性判断",
      "可访问名称和描述：可访问名称是辅助技术为交互元素播报的内容——从 label 元素、aria-label、aria-labelledby 或按钮文本计算得出；缺失或不正确的可访问名称是最常见的 WCAG 失败类别"
    ],
    "timeline": [
      [
        "1999",
        "W3C WAI publishes WCAG 1.0, the first international standard for web accessibility, establishing the principle that web content should be perceivable and operable by users with disabilities"
      ],
      [
        "2008",
        "WCAG 2.0 is published, replacing technology-specific guidelines with the technology-neutral POUR principles framework and three conformance levels (A, AA, AAA)"
      ],
      [
        "2018",
        "WCAG 2.1 adds 17 new success criteria targeting mobile accessibility, low vision, and cognitive disabilities; becomes the baseline for ADA lawsuits and EU EN 301 549 standard"
      ],
      [
        "2023",
        "WCAG 2.2 is published with 9 new success criteria including Focus Appearance and Accessible Authentication; axe-core, Lighthouse, and browser DevTools reach near-universal adoption for automated scanning"
      ]
    ],
    "timeline_zh": [
      [
        "1999",
        "W3C WAI 发布 WCAG 1.0，这是首个国际 Web 无障碍标准，确立了 Web 内容应对残障用户可感知和可操作的原则"
      ],
      [
        "2008",
        "WCAG 2.0 发布，用技术中立的 POUR 原则框架和三个合规级别（A、AA、AAA）取代了技术特定的指南"
      ],
      [
        "2018",
        "WCAG 2.1 新增 17 个针对移动无障碍、低视力和认知障碍的成功标准；成为 ADA 诉讼和欧盟 EN 301 549 标准的基准"
      ],
      [
        "2023",
        "WCAG 2.2 发布，新增 9 个成功标准，包括焦点外观和无障碍身份验证；axe-core、Lighthouse 和浏览器开发工具在自动化扫描方面达到近乎普遍的采用"
      ]
    ],
    "dos": [
      "Do integrate axe-core or Lighthouse CI as a mandatory CI gate that blocks merges on WCAG AA violations — catching regressions at the PR level costs seconds, fixing them post-release costs hours",
      "Do write automated component tests that verify accessible names for all interactive elements using testing-library queries like getByRole and getByLabelText — role-based queries that match screen reader semantics",
      "Do test with real screen readers on real devices rather than relying solely on browser accessibility tree inspection — virtual buffer mode in JAWS and NVDA produces different reading experiences than the raw DOM accessibility tree",
      "Do include users with disabilities in usability testing at least once per major product cycle — lived experience finds usability barriers that WCAG checklists do not capture, particularly for cognitive and motor impairments"
    ],
    "dos_zh": [
      "将 axe-core 或 Lighthouse CI 集成为强制性 CI 关卡，阻止因 WCAG AA 违规的合并——在 PR 级别捕获回退只需几秒，发布后修复则需数小时",
      "编写自动化组件测试，使用 testing-library 的 getByRole 和 getByLabelText 等查询验证所有交互元素的可访问名称——基于角色的查询与屏幕阅读器语义匹配",
      "在真实设备上使用真实屏幕阅读器进行测试，而不仅依赖浏览器无障碍树检查——JAWS 和 NVDA 中的虚拟缓冲区模式比原始 DOM 无障碍树产生不同的阅读体验",
      "在每个主要产品周期中至少进行一次包含残障用户的可用性测试——亲身经历发现 WCAG 清单未能捕捉的可用性障碍，尤其是认知和运动障碍"
    ],
    "donts": [
      "Do not treat automated scan results as a complete accessibility audit — a clean axe-core report means approximately 30-40% of WCAG criteria are satisfied, not that the product is accessible",
      "Do not add ARIA attributes to fix accessibility tree issues without understanding their semantics — incorrectly applied ARIA (e.g., role='button' on a div without keyboard event handling) can make accessibility worse than having no ARIA at all",
      "Do not defer accessibility testing to a final QA phase — accessibility issues in component structure and semantic HTML are architectural decisions that are expensive to retrofit; they must be addressed in design and initial implementation",
      "Do not assume visual design review covers accessibility — colour contrast, text sizing, and focus indicators are the subset visible to sighted reviewers; the majority of WCAG criteria relate to programmatic semantics invisible to visual inspection"
    ],
    "donts_zh": [
      "不要将自动化扫描结果视为完整的无障碍审计——干净的 axe-core 报告意味着大约 30-40% 的 WCAG 标准得到满足，而不是产品完全可访问",
      "不要在不了解语义的情况下添加 ARIA 属性来修复无障碍树问题——错误应用的 ARIA（如在没有键盘事件处理的 div 上使用 role='button'）可能使无障碍性比完全没有 ARIA 更差",
      "不要将无障碍测试推迟到最终 QA 阶段——组件结构和语义 HTML 中的无障碍问题是架构决策，事后修复成本高昂；必须在设计和初始实现阶段解决",
      "不要假设视觉设计审查涵盖了无障碍性——颜色对比度、文字大小和焦点指示器是视力正常的审查者可见的子集；大多数 WCAG 标准涉及视觉检查不可见的程序语义"
    ],
    "case_study_company": "BBC",
    "case_study": "The BBC has one of the most mature accessibility testing programmes in the media industry, driven by a public broadcasting mandate and UK Equality Act obligations. Their accessibility approach combines automated testing (axe-core integrated into their React component library tests and Playwright E2E suite), manual screen reader testing on every major page type with JAWS, NVDA, and VoiceOver as part of their release checklist, and a dedicated accessibility team of specialists who conduct deep audits. The BBC published their Mobile Accessibility Guidelines in 2015 as an open standard extending WCAG for mobile contexts. Their BBC iPlayer video platform underwent a major accessibility overhaul in 2019, adding audio description, signed content, and subtitle customisation after user testing with deaf and visually impaired users revealed that the existing subtitle implementation failed WCAG 1.4.8 for reflow and had screen reader navigation issues in the episode selector. Post-remediation, BBC reported a measurable increase in iPlayer usage from users identifying as disabled.",
    "case_study_zh": "BBC 在媒体行业拥有最成熟的无障碍测试项目之一，这得益于公共广播授权和英国《平等法》义务。他们的无障碍方法结合了自动化测试（axe-core 集成到其 React 组件库测试和 Playwright E2E 套件中）、对每种主要页面类型使用 JAWS、NVDA 和 VoiceOver 进行手动屏幕阅读器测试（作为发布清单的一部分），以及一个专门的无障碍专家团队进行深度审计。BBC 于 2015 年将其移动无障碍指南作为扩展 WCAG 移动场景的开放标准发布。他们的 BBC iPlayer 视频平台于 2019 年进行了重大无障碍改造，在对聋人和视障用户进行用户测试后添加了音频描述、手语内容和字幕定制功能，测试揭示现有字幕实现在回流方面未能满足 WCAG 1.4.8，且片集选择器中存在屏幕阅读器导航问题。修复后，BBC 报告了来自残障用户的 iPlayer 使用量可测量的增长。",
    "when_not_to_use": [
      "Internal tools with a strictly controlled user base of non-disabled employees where the cost of full WCAG AA compliance exceeds the business value — though WCAG AA is recommended as a baseline for all digital products",
      "Proof-of-concept prototypes and throwaway code where accessibility investment would be wasted before the prototype is validated and discarded",
      "Highly specialised technical tooling (developer IDEs, 3D modelling applications, specialised scientific visualisations) where WCAG AA compliance is aspirational but the interaction model may not fully map to current WCAG success criteria",
      "As a one-time compliance audit substitute for ongoing testing — accessibility is not a binary achieved state; it requires continuous testing integration to prevent regressions with every UI change"
    ],
    "when_not_to_use_zh": [
      "严格控制非残障员工用户群的内部工具，完整 WCAG AA 合规成本超过业务价值——尽管推荐将 WCAG AA 作为所有数字产品的基准",
      "概念验证原型和一次性代码，在原型验证和丢弃之前无障碍投资会被浪费",
      "高度专业化的技术工具（开发者 IDE、3D 建模应用、专业科学可视化），WCAG AA 合规是一种期望，但交互模型可能无法完全映射到当前 WCAG 成功标准",
      "作为持续测试的一次性合规审计替代品——无障碍不是二元的已实现状态；它需要持续的测试集成以防止每次 UI 变更时出现回退"
    ],
    "adopters": [
      "BBC",
      "GOV.UK (UK Government Digital Service)",
      "Deque Systems (axe-core)",
      "Microsoft",
      "Apple",
      "Salesforce"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "W3C WAI (2023). \"Web Content Accessibility Guidelines (WCAG) 2.2\". w3.org/TR/WCAG22/.",
    "secondary_sources": [
      "Deque Systems (2023). \"axe-core: Accessibility Engine for Automated Web UI Testing\". github.com/dequelabs/axe-core.",
      "UK Government Digital Service (2020). \"Testing for accessibility\". gov.uk/service-manual/helping-people-to-use-your-service/testing-for-accessibility.",
      "Faulkner, S. et al. (2019). \"WCAG 2.1 Understanding Docs\". w3.org/WAI/WCAG21/Understanding/."
    ],
    "typed_relations": [
      {
        "slug": "continuous-testing",
        "type": "complement"
      },
      {
        "slug": "testing-trophy",
        "type": "complement"
      },
      {
        "slug": "bdd",
        "type": "complement"
      }
    ]
  },
  {
    "id": 59,
    "name": "Blue-Green Deployment",
    "name_zh": "蓝绿部署",
    "slug": "blue-green-deployment",
    "category": "deployment",
    "desc": "Zero-downtime releases via two identical prod environments",
    "desc_zh": "通过两套相同生产环境实现零停机发布",
    "steps": [
      "Maintain two identical production environments: Blue (live) and Green (idle)",
      "Deploy new version to the idle Green environment and run full test suite",
      "Perform smoke tests and validation on Green without affecting live traffic",
      "Switch router/load balancer to redirect all traffic from Blue to Green",
      "Keep Blue on standby for instant rollback; decommission after stability confirmed"
    ],
    "steps_zh": [
      "维护两套相同的生产环境：蓝（当前运行）和绿（闲置）",
      "将新版本部署到闲置的绿色环境并执行完整测试套件",
      "在绿色环境上执行冒烟测试和验证，不影响线上流量",
      "切换路由器/负载均衡器，将所有流量从蓝色切换到绿色",
      "保留蓝色环境待命以便快速回滚，稳定后再下线"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Blue Env",
      "Switch",
      "Green Env",
      "Rollback"
    ],
    "viz_labels_zh": [
      "蓝环境",
      "流量切换",
      "绿环境",
      "快速回滚"
    ],
    "related": [
      "canary-deployment",
      "feature-flags",
      "gitops"
    ],
    "tags": [
      "deployment",
      "zero-downtime",
      "blue-green",
      "rollback"
    ],
    "origin_author": "Daniel Terhorst-North & Jez Humble / early web ops community, ~2005",
    "origin_source": "Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation (Jez Humble & David Farley, 2010)",
    "origin_source_zh": "《持续交付：发布可靠软件的系统方法》（Jez Humble & David Farley，2010）",
    "complexity": "intermediate",
    "when_to_use": [
      "Applications requiring zero-downtime deployments with instant rollback capability",
      "Regulated environments where full pre-production validation is mandatory before traffic switch",
      "Monolithic applications where partial rollout (canary) is impractical",
      "Systems where database schema changes are backward-compatible"
    ],
    "when_to_use_zh": [
      "需要零停机部署且具备即时回滚能力的应用",
      "受监管环境中流量切换前需强制完成全面预生产验证的场景",
      "单体应用中部分发布（金丝雀）不可行的场景",
      "数据库模式变更向后兼容的系统"
    ],
    "core_concepts": [
      "Environment Parity: Blue and Green environments must be identical in infrastructure, configuration, and capacity to ensure reliable switchover",
      "Atomic Switchover: Traffic routing is changed at the load balancer level, providing an all-or-nothing cutover with no partial states",
      "Instant Rollback: The previous environment remains intact and warm, allowing a sub-second rollback by reversing the router switch",
      "Idle Cost Tradeoff: Maintaining two full production environments doubles infrastructure cost during deployment windows"
    ],
    "core_concepts_zh": [
      "环境一致性：蓝绿两套环境在基础设施、配置和容量上必须完全相同，以确保切换可靠",
      "原子切换：在负载均衡器层面切换流量路由，实现全有或全无的切换，不存在中间状态",
      "即时回滚：旧环境保持完好且处于热备状态，通过反转路由即可在亚秒级完成回滚",
      "闲置成本权衡：在部署窗口期维护两套完整生产环境，基础设施成本翻倍"
    ],
    "timeline": [
      [
        "2005",
        "Blue-green pattern emerges in early web operations as a replacement for mainframe-era batch cutovers"
      ],
      [
        "2010",
        "Jez Humble and David Farley formalize blue-green deployment in Continuous Delivery"
      ],
      [
        "2013",
        "AWS Elastic Beanstalk adds native blue-green deployment support via environment URL swap"
      ],
      [
        "2017",
        "Kubernetes ecosystem tooling (Argo Rollouts, Flagger) makes blue-green declarative and automated"
      ],
      [
        "2022",
        "Cloud-native platforms (AWS CodeDeploy, Azure Deployment Slots) offer managed blue-green as a first-class feature"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "蓝绿模式在早期Web运维中出现，用于替代大型机时代的批量切换方式"
      ],
      [
        "2010",
        "Jez Humble和David Farley在《持续交付》一书中正式定义蓝绿部署"
      ],
      [
        "2013",
        "AWS Elastic Beanstalk通过环境URL交换原生支持蓝绿部署"
      ],
      [
        "2017",
        "Kubernetes生态工具（Argo Rollouts、Flagger）使蓝绿部署实现声明式自动化"
      ],
      [
        "2022",
        "云原生平台（AWS CodeDeploy、Azure部署槽）将蓝绿部署作为一等功能提供"
      ]
    ],
    "dos": [
      "Do run comprehensive smoke tests on the Green environment before switching traffic, because post-switch failures are visible to all users instantly",
      "Do keep database migrations backward-compatible so both Blue and Green can operate against the same schema simultaneously",
      "Do automate the switchover and rollback process through CI/CD pipelines to eliminate human error under pressure",
      "Do monitor the new environment closely for at least one full traffic cycle before decommissioning the old one"
    ],
    "dos_zh": [
      "务必在切换流量前对绿色环境执行全面冒烟测试，因为切换后的故障会立即影响所有用户",
      "务必保持数据库迁移向后兼容，使蓝绿两套环境能同时操作同一数据库模式",
      "务必通过CI/CD流水线自动化切换和回滚流程，消除压力下的人为失误",
      "务必在至少一个完整流量周期内密切监控新环境，之后再下线旧环境"
    ],
    "donts": [
      "Don't allow configuration drift between Blue and Green environments, because asymmetry causes false-positive test results",
      "Don't perform destructive database migrations during blue-green switches, because rollback becomes impossible if the schema is incompatible",
      "Don't skip warming up the Green environment's caches and connections, because cold-start latency spikes will degrade user experience",
      "Don't leave the idle environment running indefinitely without cost review, because it doubles your infrastructure spend"
    ],
    "donts_zh": [
      "不要让蓝绿环境之间出现配置漂移，因为不对称会导致测试结果出现假阳性",
      "不要在蓝绿切换过程中执行破坏性数据库迁移，因为模式不兼容会导致无法回滚",
      "不要跳过绿色环境的缓存和连接预热，因为冷启动延迟峰值会降低用户体验",
      "不要让闲置环境无限期运行而不审查成本，因为这会使基础设施开支翻倍"
    ],
    "case_study_company": "LinkedIn",
    "case_study": "LinkedIn adopted blue-green deployments for its main site in 2013 to eliminate multi-hour maintenance windows during weekly releases. By maintaining two identical production tiers behind their load balancers, they reduced deployment downtime from hours to under 30 seconds per release. This pattern also allowed them to validate each release with production traffic patterns before committing to the switchover.",
    "case_study_zh": "LinkedIn于2013年在其主站采用蓝绿部署，以消除每周发布时长达数小时的维护窗口。通过在负载均衡器后维护两套相同的生产层，他们将每次发布的部署停机时间从数小时缩短至30秒以内。该模式还允许他们在正式切换前使用生产流量模式验证每个版本。",
    "when_not_to_use": [
      "Microservices with independent release cadences where coordinating two full environments per service is cost-prohibitive",
      "Applications with non-backward-compatible database migrations that prevent both environments from sharing the same data layer",
      "Teams that need gradual percentage-based traffic shifting (use canary deployment instead)",
      "Stateful workloads where in-memory session data cannot survive an environment switch"
    ],
    "when_not_to_use_zh": [
      "拥有独立发布节奏的微服务——为每个服务维护两套完整环境在成本上不可承受",
      "数据库迁移不向后兼容导致两套环境无法共享同一数据层的应用",
      "需要基于百分比逐步切换流量的团队（应使用金丝雀发布替代）",
      "内存中会话数据无法在环境切换中存活的有状态工作负载"
    ],
    "adopters": [
      "LinkedIn",
      "Amazon",
      "Netflix",
      "Etsy",
      "Facebook"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Jez Humble and David Farley (2010). \"Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation\". Addison-Wesley.",
    "secondary_sources": [
      "Martin Fowler (2010). \"BlueGreenDeployment\". martinfowler.com.",
      "Gene Kim et al. (2016). \"The DevOps Handbook\". IT Revolution Press."
    ],
    "typed_relations": [
      {
        "slug": "canary-deployment",
        "type": "alternative"
      },
      {
        "slug": "feature-flags",
        "type": "complement"
      },
      {
        "slug": "gitops",
        "type": "complement"
      }
    ]
  },
  {
    "id": 60,
    "name": "Canary Deployment",
    "name_zh": "金丝雀发布",
    "slug": "canary-deployment",
    "category": "deployment",
    "desc": "Gradually roll out changes to a small user subset first",
    "desc_zh": "先向小部分用户灰度发布变更，再逐步全量推送",
    "steps": [
      "Deploy the new version alongside the stable version in production",
      "Route a small percentage (1-5%) of real traffic to the canary instance",
      "Monitor key metrics: error rates, latency, and business KPIs for the canary",
      "Incrementally increase traffic percentage if metrics remain healthy",
      "Promote canary to 100% or roll back automatically based on threshold alerts"
    ],
    "steps_zh": [
      "在生产环境中将新版本与稳定版本并行部署",
      "将少量真实流量（1-5%）路由到金丝雀实例",
      "监控金丝雀的关键指标：错误率、延迟及业务KPI",
      "若指标正常则逐步提高流量比例",
      "依据阈值告警决定将金丝雀全量推送或自动回滚"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Canary",
      "Monitor",
      "Expand",
      "Full Deploy"
    ],
    "viz_labels_zh": [
      "金丝雀",
      "监控",
      "扩量",
      "全量发布"
    ],
    "related": [
      "blue-green-deployment",
      "feature-flags",
      "dora-metrics"
    ],
    "tags": [
      "deployment",
      "canary",
      "gradual-rollout",
      "progressive-delivery"
    ],
    "origin_author": "Google / early site reliability engineering teams, ~2004",
    "origin_source": "Inspired by coal mining canary practice; formalized in Google's SRE practices and later in Site Reliability Engineering (Betsy Beyer et al., 2016)",
    "origin_source_zh": "灵感源于煤矿金丝雀实践；由Google SRE实践正式化，后收录于《SRE：Google运维解密》（Betsy Beyer等，2016）",
    "complexity": "intermediate",
    "when_to_use": [
      "High-traffic services where a full rollout failure would impact millions of users",
      "Deployments requiring real production traffic validation before full promotion",
      "Services with well-defined SLIs that can be automatically compared between canary and baseline",
      "Teams practicing progressive delivery and wanting automated rollback based on metrics"
    ],
    "when_to_use_zh": [
      "高流量服务——全量发布失败会影响数百万用户",
      "需要在全量推广前通过真实生产流量进行验证的部署",
      "拥有明确定义的SLI、可在金丝雀与基线之间自动比较的服务",
      "实践渐进式交付并希望基于指标自动回滚的团队"
    ],
    "core_concepts": [
      "Progressive Traffic Shifting: Gradually increase the percentage of traffic routed to the new version in controlled increments (1% -> 5% -> 25% -> 100%)",
      "Automated Analysis: Use statistical comparison of canary vs. baseline metrics to make promotion/rollback decisions without human judgment",
      "Blast Radius Containment: Only a small subset of users is exposed to potential issues, limiting the impact of defective releases",
      "Metric-Driven Promotion: Deployment progression is gated by objective health signals rather than time-based schedules"
    ],
    "core_concepts_zh": [
      "渐进式流量切换：以受控的增量方式逐步提高路由到新版本的流量百分比（1% -> 5% -> 25% -> 100%）",
      "自动化分析：通过统计对比金丝雀与基线的指标数据，无需人工判断即可做出晋升/回滚决策",
      "爆炸半径控制：仅有少量用户暴露于潜在问题，限制了缺陷发布的影响范围",
      "指标驱动晋升：部署推进由客观健康信号把关，而非基于时间的排程"
    ],
    "timeline": [
      [
        "2004",
        "Google implements canary analysis as part of internal deployment tooling for web search"
      ],
      [
        "2013",
        "Netflix develops internal Automated Canary Analysis (ACA) tooling, the precursor of the open-source Kayenta"
      ],
      [
        "2017",
        "Spinnaker integrates automated canary analysis as a first-class pipeline stage"
      ],
      [
        "2019",
        "Flagger brings canary deployment automation to Kubernetes with Istio and Linkerd integration"
      ],
      [
        "2022",
        "Argo Rollouts matures canary strategy with customizable traffic management and analysis templates"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "Google在内部部署工具中为Web搜索服务实现金丝雀分析"
      ],
      [
        "2013",
        "Netflix开发内部自动化金丝雀分析（ACA）工具，即后来开源工具Kayenta的前身"
      ],
      [
        "2017",
        "Spinnaker将自动化金丝雀分析集成为一等流水线阶段"
      ],
      [
        "2019",
        "Flagger通过集成Istio和Linkerd将金丝雀部署自动化引入Kubernetes"
      ],
      [
        "2022",
        "Argo Rollouts通过可定制的流量管理和分析模板完善金丝雀策略"
      ]
    ],
    "dos": [
      "Do define clear success/failure metrics and thresholds before starting the canary, because ad-hoc evaluation leads to subjective decisions",
      "Do ensure canary and baseline receive comparable traffic samples to avoid skewed comparisons",
      "Do automate the rollback process so that metric breaches trigger instant traffic drain from the canary",
      "Do allow sufficient bake time at each traffic tier to capture slow-burning issues like memory leaks"
    ],
    "dos_zh": [
      "务必在启动金丝雀前定义清晰的成功/失败指标和阈值，因为临时评估会导致主观判断",
      "务必确保金丝雀和基线接收可比的流量样本，以避免比较偏差",
      "务必自动化回滚流程，使指标突破阈值时立即从金丝雀排空流量",
      "务必在每个流量层级留出足够的烘烤时间，以捕获内存泄漏等缓慢暴露的问题"
    ],
    "donts": [
      "Don't route all sticky sessions to the canary, because this biases the sample and hides issues that affect fresh users",
      "Don't promote based solely on absence of errors -- also check latency percentiles and business metrics, because silent degradation is common",
      "Don't run canary analysis for too short a period, because some defects only manifest under sustained load or time-dependent conditions",
      "Don't ignore infrastructure-level differences between canary and baseline instances, because CPU/memory asymmetry skews results"
    ],
    "donts_zh": [
      "不要将所有粘性会话路由到金丝雀，因为这会使样本产生偏差并隐藏影响新用户的问题",
      "不要仅凭没有错误就晋升——还需检查延迟百分位和业务指标，因为无声劣化很常见",
      "不要将金丝雀分析运行时间设置得过短，因为某些缺陷只在持续负载或时间相关条件下才会显现",
      "不要忽视金丝雀和基线实例之间的基础设施差异，因为CPU/内存不对称会歪曲结果"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix built Kayenta as its automated canary analysis platform integrated with Spinnaker and open-sourced it together with Google in 2018. Every production deployment at Netflix goes through canary analysis, where a new build serves a small percentage of traffic while Kayenta statistically compares hundreds of metrics against a baseline. This system catches roughly 80% of production-impacting issues before they reach general availability, enabling Netflix to deploy hundreds of times per day with confidence.",
    "case_study_zh": "Netflix构建了Kayenta作为与Spinnaker集成的自动化金丝雀分析平台，并于2018年与Google联合将其开源。Netflix的每次生产部署都要经过金丝雀分析，新构建服务于少量流量，同时Kayenta将数百个指标与基线进行统计比较。该系统在问题到达全量发布前捕获约80%的生产影响缺陷，使Netflix每天能够自信地部署数百次。",
    "when_not_to_use": [
      "Low-traffic services where the canary subset is too small to produce statistically significant metric comparisons",
      "Deployments involving breaking API changes that cannot coexist with the previous version",
      "Systems lacking sufficient observability to compare canary and baseline health signals",
      "Batch processing systems where there is no continuous request stream to analyze"
    ],
    "when_not_to_use_zh": [
      "低流量服务——金丝雀子集过小，无法产生具有统计显著性的指标比较",
      "涉及无法与前一版本共存的破坏性API变更的部署",
      "缺乏足够可观测性来比较金丝雀与基线健康信号的系统",
      "没有可供分析的持续请求流的批处理系统"
    ],
    "adopters": [
      "Netflix",
      "Google",
      "Facebook",
      "Uber",
      "Lyft"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Betsy Beyer, Chris Jones, Jennifer Petoff, and Niall Richard Murphy (2016). \"Site Reliability Engineering: How Google Runs Production Systems\". O'Reilly Media.",
    "secondary_sources": [
      "Jez Humble and David Farley (2010). \"Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation\". Addison-Wesley.",
      "Danilo Sato, Arjan Schaaf, and Daniel Bryant (2019). \"Canary Releases\". martinfowler.com."
    ],
    "typed_relations": [
      {
        "slug": "blue-green-deployment",
        "type": "alternative"
      },
      {
        "slug": "feature-flags",
        "type": "complement"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      }
    ]
  },
  {
    "id": 61,
    "name": "Feature Flags",
    "name_zh": "功能开关",
    "slug": "feature-flags",
    "category": "deployment",
    "desc": "Decouple code deployment from feature release via toggles",
    "desc_zh": "通过开关将代码部署与功能发布解耦",
    "steps": [
      "Identify the feature to gate and define the flag's targeting rules (user, segment, env)",
      "Wrap new feature code behind a conditional flag check in the codebase",
      "Deploy code to production with the flag disabled; verify no impact",
      "Gradually enable the flag for internal users, then beta users, then general audience",
      "Remove the flag and dead code paths once the feature is fully rolled out"
    ],
    "steps_zh": [
      "确定要管控的功能，定义开关的目标规则（用户、分组、环境）",
      "在代码库中用条件判断将新功能代码包裹在开关检查之后",
      "以开关关闭状态部署到生产环境，验证无影响",
      "依次为内部用户、Beta用户、全体用户逐步开启开关",
      "功能完全上线后移除开关及已废弃的代码路径"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Flag Store",
      "Targeting",
      "Rollout",
      "Override"
    ],
    "viz_labels_zh": [
      "标志仓库",
      "定向规则",
      "发布比例",
      "强制覆盖"
    ],
    "related": [
      "canary-deployment",
      "blue-green-deployment",
      "branch-by-abstraction"
    ],
    "tags": [
      "feature-flags",
      "toggles",
      "release-management",
      "progressive-delivery"
    ],
    "origin_author": "Flickr / early continuous deployment pioneers, ~2009",
    "origin_source": "Feature Toggles pattern described by Martin Fowler (2010); popularized by Flickr's Flipping Out deployment culture",
    "origin_source_zh": "Martin Fowler于2010年描述的Feature Toggles模式；由Flickr的Flipping Out部署文化推广",
    "complexity": "beginner",
    "when_to_use": [
      "Trunk-based development workflows where all developers commit to the main branch and features must be hidden until ready",
      "A/B testing and experimentation where different user segments receive different experiences",
      "Gradual feature rollouts that need to be decoupled from code deployment cadence",
      "Emergency kill-switch scenarios where a problematic feature must be disabled without redeploying"
    ],
    "when_to_use_zh": [
      "主干开发工作流中所有开发者提交到主分支，功能需隐藏直至就绪",
      "A/B测试和实验中不同用户群体接收不同体验的场景",
      "需要与代码部署节奏解耦的渐进式功能发布",
      "需要在不重新部署的情况下禁用问题功能的紧急熔断场景"
    ],
    "core_concepts": [
      "Release Toggle: A short-lived flag used to hide incomplete features in production until they are ready for launch",
      "Experiment Toggle: A flag used for A/B testing, where different user segments receive different feature variants to measure impact",
      "Ops Toggle: A long-lived flag used as a circuit breaker to gracefully degrade features under load or during incidents",
      "Permission Toggle: A flag that gates features based on user entitlements, subscriptions, or role-based access",
      "Flag Debt: The technical debt accumulated when feature flags are not cleaned up after rollout, leading to dead code paths and complexity"
    ],
    "core_concepts_zh": [
      "发布开关：短期标记，用于在功能准备就绪前隐藏生产中的未完成功能",
      "实验开关：用于A/B测试的标记，不同用户群体接收不同功能变体以衡量影响",
      "运维开关：长期标记，在负载过高或故障期间作为断路器优雅降级功能",
      "权限开关：基于用户权限、订阅或角色的功能访问控制标记",
      "开关债务：功能开关在发布后未及时清理而累积的技术债，导致死代码路径和复杂度增加"
    ],
    "timeline": [
      [
        "2009",
        "Flickr engineers describe feature flippers enabling 10+ deploys per day on their photo platform"
      ],
      [
        "2010",
        "Martin Fowler publishes the FeatureToggle bliki entry, bringing the pattern to a wide audience"
      ],
      [
        "2013",
        "LaunchDarkly founded, creating the first commercial feature flag management platform"
      ],
      [
        "2017",
        "Feature flags become a core pillar of progressive delivery alongside canary deployments"
      ],
      [
        "2022",
        "OpenFeature project launches under CNCF to standardize feature flag APIs across vendors"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "Flickr工程师描述功能翻转器，使其照片平台实现每天10+次部署"
      ],
      [
        "2010",
        "Martin Fowler发表FeatureToggle博文，使该模式广为人知"
      ],
      [
        "2013",
        "LaunchDarkly成立，创建首个商业功能开关管理平台"
      ],
      [
        "2017",
        "功能开关与金丝雀发布一起成为渐进式交付的核心支柱"
      ],
      [
        "2022",
        "OpenFeature项目在CNCF下启动，标准化跨厂商的功能开关API"
      ]
    ],
    "dos": [
      "Do establish a flag lifecycle policy with clear ownership and expiry dates, because abandoned flags become permanent technical debt",
      "Do use a centralized flag management system with audit logs, because scattered flags in config files are impossible to govern",
      "Do test both flag-on and flag-off code paths in CI, because untested paths are the most common source of flag-related incidents",
      "Do keep flag evaluation fast and cached, because flags in hot paths affect request latency for every user"
    ],
    "dos_zh": [
      "务必建立功能开关生命周期策略，明确所有权和过期日期，因为被遗弃的开关会成为永久技术债",
      "务必使用带审计日志的集中式开关管理系统，因为散落在配置文件中的开关无法治理",
      "务必在CI中测试开关开启和关闭两条代码路径，因为未测试的路径是开关相关事故的最常见来源",
      "务必保持开关评估快速并缓存结果，因为热路径中的开关会影响每个用户的请求延迟"
    ],
    "donts": [
      "Don't create nested flag dependencies (flag A depends on flag B), because combinatorial complexity makes behavior unpredictable",
      "Don't use feature flags as a long-term configuration mechanism, because they lack the governance and validation of proper config management",
      "Don't skip the cleanup phase after a flag is fully rolled out, because stale flags accumulate and make the codebase harder to understand",
      "Don't store sensitive targeting rules in client-side flag SDKs, because they are visible to end users in browser code"
    ],
    "donts_zh": [
      "不要创建嵌套的开关依赖（开关A依赖开关B），因为组合复杂度使行为不可预测",
      "不要将功能开关作为长期配置管理机制，因为它们缺乏正式配置管理的治理和验证",
      "不要在开关全量发布后跳过清理阶段，因为陈旧开关会积累并使代码库更难理解",
      "不要在客户端开关SDK中存储敏感定位规则，因为它们对终端用户在浏览器代码中可见"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub uses a custom feature flag system called Flipper to control the rollout of every major feature. When launching GitHub Copilot, they used feature flags to gradually enable the feature for internal staff, then waitlisted beta users, then paying customers, monitoring usage patterns and error rates at each stage. This approach allowed them to iterate on the product in production without impacting the broader user base, and to instantly disable Copilot features during early reliability issues.",
    "case_study_zh": "GitHub使用名为Flipper的自定义功能开关系统控制每个重大功能的发布。在推出GitHub Copilot时，他们通过功能开关依次为内部员工、等待名单Beta用户、付费客户逐步启用该功能，在每个阶段监控使用模式和错误率。这种方式使他们能够在不影响更大用户群的情况下在生产中迭代产品，并在早期可靠性问题期间立即禁用Copilot功能。",
    "when_not_to_use": [
      "Very small teams with infrequent releases where the overhead of flag management exceeds the deployment risk",
      "Performance-critical hot paths where even microsecond flag evaluation overhead is unacceptable",
      "Systems with strict regulatory requirements that prohibit deploying unreleased code to production",
      "Short-lived projects or prototypes where the lifecycle management overhead is not justified"
    ],
    "when_not_to_use_zh": [
      "发布频率很低的小型团队——开关管理的开销超过部署风险",
      "对性能极其敏感的热路径——即使微秒级的开关评估开销也不可接受",
      "严格监管要求禁止将未发布代码部署到生产的系统",
      "生命周期管理开销不合理的短期项目或原型"
    ],
    "adopters": [
      "GitHub",
      "Netflix",
      "Google",
      "LaunchDarkly",
      "Etsy"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Martin Fowler (2010). \"FeatureToggle\". martinfowler.com.",
    "secondary_sources": [
      "Pete Hodgson (2017). \"Feature Toggles (aka Feature Flags)\". martinfowler.com.",
      "Jez Humble and David Farley (2010). \"Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "canary-deployment",
        "type": "complement"
      },
      {
        "slug": "blue-green-deployment",
        "type": "complement"
      },
      {
        "slug": "branch-by-abstraction",
        "type": "complement"
      }
    ]
  },
  {
    "id": 62,
    "name": "GitOps",
    "name_zh": "GitOps",
    "slug": "gitops",
    "category": "deployment",
    "desc": "Use Git as the single source of truth for infra state",
    "desc_zh": "以Git作为基础设施状态的唯一可信来源",
    "steps": [
      "Store all infrastructure and application configuration declaratively in a Git repository",
      "Enforce code review and branch protection so all changes go through pull requests",
      "Set up a GitOps operator (Argo CD, Flux) to watch the repo for changes",
      "Operator detects drift between desired state in Git and actual cluster state",
      "Operator automatically reconciles the cluster to match the Git-declared desired state"
    ],
    "steps_zh": [
      "以声明式方式将所有基础设施和应用配置存储在Git仓库中",
      "强制代码审查与分支保护，确保所有变更经过Pull Request流程",
      "配置GitOps控制器（Argo CD、Flux）监听仓库变化",
      "控制器检测Git期望状态与集群实际状态之间的偏差",
      "控制器自动将集群协调至Git声明的期望状态"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Commit",
      "Detect",
      "Reconcile",
      "Deploy"
    ],
    "viz_labels_zh": [
      "代码提交",
      "变更检测",
      "状态同步",
      "部署"
    ],
    "related": [
      "infrastructure-as-code",
      "three-ways-devops",
      "dora-metrics"
    ],
    "tags": [
      "gitops",
      "declarative",
      "reconciliation",
      "kubernetes"
    ],
    "origin_author": "Weaveworks / Alexis Richardson, 2017",
    "origin_source": "Weaveworks blog post GitOps - Operations by Pull Request (2017); CNCF GitOps Working Group principles (2021)",
    "origin_source_zh": "Weaveworks博客文章《GitOps - 基于Pull Request的运维》（2017）；CNCF GitOps工作组原则（2021）",
    "complexity": "intermediate",
    "when_to_use": [
      "Kubernetes-native environments where declarative configuration aligns naturally with the reconciliation model",
      "Teams that want complete audit trails of every infrastructure change through Git history",
      "Multi-cluster or multi-environment deployments that need consistent, reproducible configuration",
      "Organizations requiring compliance and change control via pull request approval workflows"
    ],
    "when_to_use_zh": [
      "Kubernetes原生环境——声明式配置与协调模型天然契合",
      "希望通过Git历史对每次基础设施变更保留完整审计轨迹的团队",
      "需要一致、可复现配置的多集群或多环境部署",
      "需要通过Pull Request审批工作流实现合规和变更控制的组织"
    ],
    "core_concepts": [
      "Declarative Desired State: All system configuration is described declaratively in Git, defining what the system should look like rather than how to get there",
      "Continuous Reconciliation: A GitOps operator continuously compares the desired state in Git with the actual state in the cluster and corrects any drift automatically",
      "Pull-Based Deployment: Instead of CI pushing changes to the cluster, the cluster-side operator pulls desired state from Git, improving security by not exposing cluster credentials to external CI systems",
      "Immutable Audit Trail: Every change to infrastructure is captured as a Git commit with author, reviewer, timestamp, and diff, providing complete traceability"
    ],
    "core_concepts_zh": [
      "声明式期望状态：所有系统配置在Git中以声明式方式描述，定义系统应有的样子而非如何到达",
      "持续协调：GitOps控制器持续对比Git中的期望状态与集群中的实际状态，自动纠正任何偏差",
      "拉取式部署：集群侧控制器从Git拉取期望状态，而非由CI推送变更到集群，通过不向外部CI系统暴露集群凭证提升安全性",
      "不可变审计轨迹：每次基础设施变更都作为Git提交记录，包含作者、审查者、时间戳和差异，提供完整可追溯性"
    ],
    "timeline": [
      [
        "2017",
        "Alexis Richardson of Weaveworks coins the term GitOps and publishes the foundational blog post"
      ],
      [
        "2018",
        "Weaveworks releases Flux v1, the first dedicated GitOps operator for Kubernetes"
      ],
      [
        "2019",
        "Argo CD reaches v1.0, becoming a widely adopted GitOps continuous delivery tool for Kubernetes"
      ],
      [
        "2021",
        "CNCF GitOps Working Group publishes formal GitOps Principles defining the four core tenets"
      ],
      [
        "2023",
        "Flux v2 and Argo CD mature into CNCF graduated/incubating projects with broad enterprise adoption"
      ]
    ],
    "timeline_zh": [
      [
        "2017",
        "Weaveworks的Alexis Richardson创造GitOps一词并发表奠基性博客文章"
      ],
      [
        "2018",
        "Weaveworks发布Flux v1，首个专用Kubernetes GitOps控制器"
      ],
      [
        "2019",
        "Argo CD发布v1.0，成为广泛采用的Kubernetes GitOps持续交付工具"
      ],
      [
        "2021",
        "CNCF GitOps工作组发布正式GitOps原则，定义四大核心信条"
      ],
      [
        "2023",
        "Flux v2和Argo CD发展为CNCF毕业/孵化项目，获得广泛企业级采用"
      ]
    ],
    "dos": [
      "Do separate application config repositories from application source code repositories, because coupling them creates circular deployment triggers",
      "Do implement drift detection alerts so the team knows when manual cluster changes bypass the GitOps workflow",
      "Do use sealed secrets or external secret managers (Vault, AWS Secrets Manager) to avoid storing plaintext secrets in Git",
      "Do structure repositories with clear environment promotion paths (dev -> staging -> production) using branches or directories"
    ],
    "dos_zh": [
      "务必将应用配置仓库与应用源代码仓库分离，因为耦合会造成循环部署触发",
      "务必实现偏差检测告警，使团队知道何时有手动集群变更绕过了GitOps工作流",
      "务必使用密封密钥或外部密钥管理器（Vault、AWS Secrets Manager）以避免在Git中存储明文密钥",
      "务必使用分支或目录结构构建清晰的环境晋升路径（dev -> staging -> production）"
    ],
    "donts": [
      "Don't make manual kubectl changes to the cluster and expect them to persist, because the GitOps operator will revert them to match Git state",
      "Don't store secrets in plaintext in Git repositories, because Git history is permanent and a committed secret stays exposed to everyone with repository access until it is rotated",
      "Don't use push-based CI/CD pipelines that bypass the GitOps operator, because this defeats the single-source-of-truth guarantee",
      "Don't create monolithic config repositories for dozens of services, because merge conflicts and slow reconciliation loops will bottleneck teams"
    ],
    "donts_zh": [
      "不要对集群进行手动kubectl变更并期望它们持久存在，因为GitOps控制器会将其还原为Git状态",
      "不要在Git仓库中以明文存储密钥，因为Git历史是永久的，已提交的密钥在轮换之前会一直暴露给所有拥有仓库访问权限的人",
      "不要使用绕过GitOps控制器的推送式CI/CD流水线，因为这破坏了单一可信来源的保证",
      "不要为数十个服务创建单一的巨型配置仓库，因为合并冲突和缓慢的协调循环会成为团队瓶颈"
    ],
    "case_study_company": "Intuit",
    "case_study": "Intuit adopted Argo CD as their GitOps platform to manage over 2,500 microservices across multiple Kubernetes clusters serving products like TurboTax and QuickBooks. By requiring all changes to flow through Git pull requests, they achieved full audit compliance for SOX regulations while enabling developers to self-service deployments. Their GitOps adoption reduced deployment lead time by 60% and configuration-related incidents by 40%.",
    "case_study_zh": "Intuit采用Argo CD作为其GitOps平台，管理为TurboTax和QuickBooks等产品服务的多个Kubernetes集群上的2500多个微服务。通过要求所有变更流经Git Pull Request，他们在实现SOX法规全面审计合规的同时，使开发者能够自助部署。GitOps的采用使部署前置时间缩短60%，配置相关事故减少40%。",
    "when_not_to_use": [
      "Non-Kubernetes environments where declarative reconciliation operators are unavailable or immature",
      "Teams with very infrequent infrastructure changes where GitOps tooling overhead is not justified",
      "Imperative or procedural infrastructure changes (one-time database migrations) that do not fit the declarative model",
      "Environments with strict air-gap requirements where the operator cannot reach a Git repository"
    ],
    "when_not_to_use_zh": [
      "非Kubernetes环境——声明式协调控制器不可用或不成熟",
      "基础设施变更频率极低、GitOps工具开销不合理的团队",
      "不适用声明式模型的命令式或过程式基础设施变更（如一次性数据库迁移）",
      "控制器无法访问Git仓库的严格物理隔离环境"
    ],
    "adopters": [
      "Intuit",
      "Tesla",
      "Alibaba",
      "Red Hat",
      "Palo Alto Networks"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Alexis Richardson (2017). \"GitOps - Operations by Pull Request\". Weaveworks Blog.",
    "secondary_sources": [
      "CNCF GitOps Working Group (2021). \"GitOps Principles\". opengitops.dev.",
      "Cornelia Davis (2019). \"Cloud Native Patterns\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "infrastructure-as-code",
        "type": "complement"
      },
      {
        "slug": "three-ways-devops",
        "type": "complement"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      }
    ]
  },
  {
    "id": 63,
    "name": "DORA Metrics",
    "name_zh": "DORA指标",
    "slug": "dora-metrics",
    "category": "deployment",
    "desc": "Four key metrics measuring software delivery performance",
    "desc_zh": "衡量软件交付效能的四项关键指标",
    "steps": [
      "Instrument CI/CD pipelines to capture Deployment Frequency per environment",
      "Track Lead Time for Changes from first commit to production deployment",
      "Measure Change Failure Rate as the percentage of deployments causing incidents",
      "Record Mean Time to Restore (MTTR) from incident detection to full recovery",
      "Benchmark against DORA elite/high/medium/low tiers and set quarterly improvement targets"
    ],
    "steps_zh": [
      "在CI/CD流水线中埋点，采集每个环境的部署频率",
      "跟踪从首次提交到生产部署的变更前置时间",
      "以导致故障的部署比例衡量变更失败率",
      "记录从故障检测到完全恢复的平均恢复时间（MTTR）",
      "对照DORA精英/高/中/低等级进行基准对比，制定季度改进目标"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "Deploy Freq",
      "Lead Time",
      "MTTR",
      "Change Fail Rate"
    ],
    "viz_labels_zh": [
      "部署频率",
      "变更前置时间",
      "故障恢复时间",
      "变更失败率"
    ],
    "related": [
      "three-ways-devops",
      "calms-framework",
      "sli-slo-sla"
    ],
    "tags": [
      "dora",
      "delivery-performance",
      "metrics",
      "devops"
    ],
    "origin_author": "Nicole Forsgren, Jez Humble, Gene Kim, 2018",
    "origin_source": "Accelerate: The Science of Lean Software and DevOps (2018); State of DevOps Reports by DORA team (2014-present)",
    "origin_source_zh": "《Accelerate：精益软件与DevOps的科学》（2018）；DORA团队的DevOps现状报告（2014年至今）",
    "complexity": "beginner",
    "when_to_use": [
      "Organizations seeking data-driven insights into their software delivery performance",
      "Teams starting a DevOps transformation that need objective baselines and measurable improvement targets",
      "Engineering leadership communicating delivery health to business stakeholders with standardized benchmarks",
      "Continuous improvement programs that need quantifiable evidence of process changes' impact"
    ],
    "when_to_use_zh": [
      "寻求数据驱动洞察其软件交付效能的组织",
      "启动DevOps转型、需要客观基线和可衡量改进目标的团队",
      "工程领导层使用标准化基准向业务相关方沟通交付健康状况",
      "需要量化证据证明流程变更影响的持续改进计划"
    ],
    "core_concepts": [
      "Deployment Frequency: How often code is deployed to production, reflecting the team's ability to deliver small, frequent batches of value",
      "Lead Time for Changes: The elapsed time from code commit to production deployment, measuring the efficiency of the delivery pipeline",
      "Change Failure Rate: The percentage of deployments that cause a failure in production, indicating the quality and stability of releases",
      "Mean Time to Restore (MTTR): How quickly the team can recover from a production failure, reflecting operational resilience and incident response maturity",
      "Elite Performance: DORA research shows elite performers deploy on-demand, with lead times under one hour, change failure rates below 5%, and MTTR under one hour"
    ],
    "core_concepts_zh": [
      "部署频率：代码部署到生产环境的频率，反映团队以小批量频繁交付价值的能力",
      "变更前置时间：从代码提交到生产部署的耗时，衡量交付流水线的效率",
      "变更失败率：导致生产故障的部署百分比，指示发布的质量和稳定性",
      "平均恢复时间（MTTR）：团队从生产故障中恢复的速度，反映运维韧性和事故响应成熟度",
      "精英表现：DORA研究表明精英级团队按需部署，前置时间低于一小时，变更失败率低于5%，MTTR低于一小时"
    ],
    "timeline": [
      [
        "2014",
        "Nicole Forsgren and Jez Humble publish the first State of DevOps Report with Puppet Labs, identifying key delivery metrics"
      ],
      [
        "2018",
        "Forsgren, Humble, and Kim publish Accelerate, establishing the four key metrics with rigorous statistical research"
      ],
      [
        "2019",
        "Google acquires DORA and integrates the research program into Google Cloud"
      ],
      [
        "2021",
        "DORA adds a fifth metric, Reliability, to the annual State of DevOps Report"
      ],
      [
        "2023",
        "DORA metrics become the industry standard for DevOps maturity assessment, with tooling from Sleuth, LinearB, Jellyfish, and Faros AI"
      ]
    ],
    "timeline_zh": [
      [
        "2014",
        "Nicole Forsgren和Jez Humble与Puppet Labs发布首份DevOps现状报告，识别关键交付指标"
      ],
      [
        "2018",
        "Forsgren、Humble和Kim出版《Accelerate》，通过严谨统计研究确立四大关键指标"
      ],
      [
        "2019",
        "Google收购DORA并将研究计划整合到Google Cloud"
      ],
      [
        "2021",
        "DORA在年度DevOps现状报告中新增第五个指标——可靠性"
      ],
      [
        "2023",
        "DORA指标成为DevOps成熟度评估的行业标准，Sleuth、LinearB、Jellyfish和Faros AI等提供配套工具"
      ]
    ],
    "dos": [
      "Do measure all four metrics together as a balanced scorecard, because optimizing one in isolation leads to dysfunction",
      "Do automate metric collection from CI/CD systems and incident management tools to ensure accuracy and reduce manual reporting burden",
      "Do use metrics as team-level improvement signals rather than individual performance evaluations, because punitive use destroys psychological safety",
      "Do benchmark against your own historical trends first before comparing to industry tiers, because context matters more than absolute numbers"
    ],
    "dos_zh": [
      "务必将四个指标作为平衡记分卡一起衡量，因为孤立优化某一项会导致失衡",
      "务必从CI/CD系统和事故管理工具自动采集指标，以确保准确性并减少手动报告负担",
      "务必将指标用作团队级改进信号而非个人绩效评估，因为惩罚性使用会摧毁心理安全感",
      "务必先与自身历史趋势对标，再与行业等级比较，因为上下文比绝对数字更重要"
    ],
    "donts": [
      "Don't use DORA metrics as a weapon to compare or rank teams competitively, because this incentivizes gaming the metrics rather than genuine improvement",
      "Don't rely on self-reported surveys when automated instrumentation is available, because subjective estimates are consistently inaccurate",
      "Don't set arbitrary DORA tier targets without understanding current constraints, because unrealistic goals demoralize teams",
      "Don't ignore the correlation between metrics -- high deployment frequency with high change failure rate signals systemic quality issues"
    ],
    "donts_zh": [
      "不要将DORA指标作为竞争性比较或排名团队的武器，因为这会激励注水指标而非真正改进",
      "不要在可自动化采集时依赖自我报告的调查，因为主观估计始终不准确",
      "不要在不了解当前约束的情况下设定任意DORA等级目标，因为不切实际的目标会打击团队士气",
      "不要忽视指标间的关联——高部署频率加上高变更失败率意味着系统性质量问题"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify adopted DORA metrics across its 200+ autonomous squads to create a unified language for delivery performance without imposing top-down mandates. Each squad tracks their own four metrics on internal dashboards and uses them to identify bottlenecks during retrospectives. Over two years, squads that actively tracked DORA metrics improved their lead time by an average of 45% and reduced change failure rates by 30%.",
    "case_study_zh": "Spotify在其200多个自治小队中采用DORA指标，在不施加自上而下强制要求的情况下创建统一的交付效能语言。每个小队在内部仪表盘上跟踪自己的四项指标，并在回顾中使用它们识别瓶颈。在两年内，积极跟踪DORA指标的小队平均将前置时间缩短45%，变更失败率降低30%。",
    "when_not_to_use": [
      "Very early-stage startups where optimizing delivery metrics is premature compared to finding product-market fit",
      "Hardware or embedded systems with fundamentally different release cadences where daily deployment is physically impossible",
      "Organizations that will weaponize the metrics for punitive team comparisons rather than improvement",
      "One-time delivery projects where ongoing delivery metrics have no long-term value"
    ],
    "when_not_to_use_zh": [
      "极早期创业公司——与寻找产品市场契合度相比，优化交付指标为时过早",
      "发布节奏根本不同、每日部署在物理上不可能的硬件或嵌入式系统",
      "会将指标武器化用于惩罚性团队比较而非改进的组织",
      "持续交付指标没有长期价值的一次性交付项目"
    ],
    "adopters": [
      "Spotify",
      "Google",
      "Microsoft",
      "Capital One",
      "Target"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "reliability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Nicole Forsgren, Jez Humble, and Gene Kim (2018). \"Accelerate: The Science of Lean Software and DevOps\". IT Revolution Press.",
    "secondary_sources": [
      "DORA Team (2019). \"State of DevOps Report\". cloud.google.com/devops.",
      "Nicole Forsgren et al. (2021). \"The SPACE of Developer Productivity\". ACM Queue, 19(1)."
    ],
    "typed_relations": [
      {
        "slug": "three-ways-devops",
        "type": "complement"
      },
      {
        "slug": "calms-framework",
        "type": "complement"
      },
      {
        "slug": "sli-slo-sla",
        "type": "complement"
      }
    ]
  },
  {
    "id": 64,
    "name": "CALMS Framework",
    "name_zh": "CALMS框架",
    "slug": "calms-framework",
    "category": "deployment",
    "desc": "Five DevOps pillars: Culture, Automation, Lean, Measurement, Sharing",
    "desc_zh": "DevOps五大支柱：文化、自动化、精益、度量、共享",
    "steps": [
      "Culture: break down dev/ops silos, establish shared ownership and blameless post-mortems",
      "Automation: automate build, test, deploy, and infrastructure provisioning pipelines",
      "Lean: apply value-stream mapping to identify and eliminate workflow bottlenecks",
      "Measurement: define and collect metrics across delivery, reliability, and business outcomes",
      "Sharing: create internal knowledge bases, runbooks, and cross-team learning sessions"
    ],
    "steps_zh": [
      "文化：打破开发/运维壁垒，建立共同责任制和无责任事后回顾机制",
      "自动化：自动化构建、测试、部署及基础设施供应流水线",
      "精益：运用价值流图识别并消除工作流程中的瓶颈",
      "度量：定义并收集交付、可靠性和业务成果的指标",
      "共享：建立内部知识库、运维手册，开展跨团队学习交流"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "Culture",
      "Automation",
      "Lean",
      "Measurement",
      "Sharing"
    ],
    "viz_labels_zh": [
      "文化",
      "自动化",
      "精益",
      "度量",
      "共享"
    ],
    "related": [
      "three-ways-devops",
      "dora-metrics",
      "team-topologies"
    ],
    "tags": [
      "devops",
      "culture",
      "automation",
      "lean",
      "sharing"
    ],
    "origin_author": "Jez Humble & John Willis, 2010",
    "origin_source": "Presented at DevOpsDays and refined in The DevOps Handbook (Gene Kim, Jez Humble, Patrick Debois, John Willis, 2016)",
    "origin_source_zh": "在DevOpsDays上提出，后在《DevOps实践指南》（Gene Kim、Jez Humble、Patrick Debois、John Willis，2016）中完善",
    "complexity": "beginner",
    "when_to_use": [
      "Organizations beginning a DevOps adoption that need a holistic assessment framework beyond just tools",
      "Teams evaluating their DevOps maturity across cultural, technical, and process dimensions",
      "Change agents building a business case for DevOps transformation by identifying gaps in all five pillars",
      "Post-incident reviews seeking root causes that span culture, process, and tooling rather than just technical failures"
    ],
    "when_to_use_zh": [
      "开始DevOps采纳、需要超越工具层面进行全面评估的组织",
      "在文化、技术和流程维度评估DevOps成熟度的团队",
      "通过识别五大支柱差距为DevOps转型构建商业案例的变革推动者",
      "寻求跨越文化、流程和工具层面而非仅技术故障的根因的事故后复盘"
    ],
    "core_concepts": [
      "Culture: Fostering shared responsibility between development and operations through blameless post-mortems, joint on-call, and cross-functional team structures",
      "Automation: Eliminating manual, error-prone processes in build, test, deployment, and infrastructure provisioning to increase speed and reliability",
      "Lean: Applying lean manufacturing principles like value-stream mapping, WIP limits, and waste elimination to software delivery workflows",
      "Measurement: Collecting and acting on metrics across delivery performance, system health, and business outcomes to enable data-driven decisions",
      "Sharing: Breaking knowledge silos through internal tech talks, shared runbooks, post-incident reports, and cross-team collaboration tools"
    ],
    "core_concepts_zh": [
      "文化：通过无责任事后复盘、联合值班和跨职能团队结构，培养开发和运维之间的共同责任意识",
      "自动化：消除构建、测试、部署和基础设施供应中手动且容易出错的流程，以提高速度和可靠性",
      "精益：将价值流图、在制品限制和消除浪费等精益制造原则应用于软件交付工作流",
      "度量：收集并基于交付效能、系统健康和业务成果的指标采取行动，实现数据驱动的决策",
      "共享：通过内部技术分享、共享运维手册、事故报告和跨团队协作工具打破知识壁垒"
    ],
    "timeline": [
      [
        "2008",
        "Patrick Debois and Andrew Shafer discuss Agile Infrastructure at Agile 2008, planting the seeds of DevOps"
      ],
      [
        "2009",
        "First DevOpsDays conference held in Ghent, Belgium, establishing the DevOps movement"
      ],
      [
        "2010",
        "Jez Humble and John Willis articulate the CALMS acronym as a DevOps maturity assessment model"
      ],
      [
        "2016",
        "The DevOps Handbook published, formalizing CALMS alongside The Three Ways and other core practices"
      ],
      [
        "2020",
        "CALMS becomes the standard framework used by consultancies and enterprises for DevOps readiness assessments"
      ]
    ],
    "timeline_zh": [
      [
        "2008",
        "Patrick Debois和Andrew Shafer在Agile 2008上讨论敏捷基础设施，播下DevOps的种子"
      ],
      [
        "2009",
        "首届DevOpsDays大会在比利时根特举行，确立DevOps运动"
      ],
      [
        "2010",
        "Jez Humble和John Willis提出CALMS缩写作为DevOps成熟度评估模型"
      ],
      [
        "2016",
        "《DevOps实践指南》出版，将CALMS与三步法等核心实践一同正式化"
      ],
      [
        "2020",
        "CALMS成为咨询公司和企业用于DevOps就绪评估的标准框架"
      ]
    ],
    "dos": [
      "Do start with Culture assessment before investing in tools, because automation without cultural change creates cargo-cult DevOps",
      "Do measure both technical metrics (DORA) and cultural indicators (employee satisfaction, collaboration frequency) for a complete picture",
      "Do treat the five pillars as interconnected rather than sequential, because weaknesses in one pillar undermine gains in others",
      "Do use CALMS as a periodic health check rather than a one-time assessment, because organizational maturity evolves continuously"
    ],
    "dos_zh": [
      "务必在投资工具前先评估文化，因为没有文化变革的自动化只会创造货物崇拜式的DevOps",
      "务必同时衡量技术指标（DORA）和文化指标（员工满意度、协作频率）以获得完整画面",
      "务必将五大支柱视为相互关联而非顺序执行，因为某一支柱的薄弱会削弱其他支柱的收益",
      "务必将CALMS作为定期健康检查而非一次性评估，因为组织成熟度持续演进"
    ],
    "donts": [
      "Don't reduce DevOps to just Automation while ignoring Culture and Sharing, because tools alone cannot solve collaboration problems",
      "Don't implement blameless post-mortems in name only while still punishing engineers for incidents, because the hypocrisy destroys trust",
      "Don't skip the Lean pillar, because without value-stream analysis you automate waste instead of eliminating it",
      "Don't treat Sharing as optional documentation -- make it active knowledge transfer through pairing, rotations, and internal conferences"
    ],
    "donts_zh": [
      "不要将DevOps简化为仅自动化而忽视文化和共享，因为工具本身无法解决协作问题",
      "不要只在名义上实行无责任事后复盘而仍然惩罚工程师，因为这种虚伪会摧毁信任",
      "不要跳过精益支柱，因为没有价值流分析你只会将浪费自动化而非消除它",
      "不要将共享视为可选的文档——通过结对、轮岗和内部会议进行主动知识传递"
    ],
    "case_study_company": "ING Bank",
    "case_study": "ING Bank Netherlands used the CALMS framework to guide their large-scale agile and DevOps transformation starting in 2015. They reorganized 3,500 employees into cross-functional squads and tribes (Culture), built a unified CI/CD platform (Automation), adopted Kanban boards with WIP limits (Lean), implemented DORA metrics dashboards (Measurement), and created an internal engineering blog and guild system (Sharing). Within three years, their release cycle went from quarterly to multiple times per day.",
    "case_study_zh": "ING银行荷兰分行从2015年开始使用CALMS框架指导其大规模敏捷与DevOps转型。他们将3500名员工重组为跨职能小队和部落（文化），构建了统一的CI/CD平台（自动化），采用带在制品限制的看板（精益），实施DORA指标仪表盘（度量），并创建了内部工程博客和公会系统（共享）。在三年内，其发布周期从季度缩短到每天多次。",
    "when_not_to_use": [
      "Organizations looking for a purely technical deployment framework rather than an organizational change model",
      "Teams that need specific prescriptive practices rather than a high-level assessment framework",
      "Environments where leadership is unwilling to address cultural issues and only wants tool-level changes",
      "Very small teams (2-3 people) where silos don't exist and formal frameworks add unnecessary overhead"
    ],
    "when_not_to_use_zh": [
      "寻求纯技术部署框架而非组织变革模型的组织",
      "需要具体规定性实践而非高级别评估框架的团队",
      "领导层不愿解决文化问题、只想进行工具层面变更的环境",
      "壁垒不存在、正式框架增加不必要开销的极小团队（2-3人）"
    ],
    "adopters": [
      "ING Bank",
      "Target",
      "Ticketmaster",
      "Nordstrom",
      "Nationwide Insurance"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Jez Humble and John Willis (2010). \"CALMS Framework\". Presented at DevOpsDays.",
    "secondary_sources": [
      "Gene Kim, Jez Humble, Patrick Debois, and John Willis (2016). \"The DevOps Handbook\". IT Revolution Press.",
      "Patrick Debois (2009). \"DevOpsDays Ghent: Agile System Administration\". devopsdays.org."
    ],
    "typed_relations": [
      {
        "slug": "three-ways-devops",
        "type": "complement"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      }
    ]
  },
  {
    "id": 65,
    "name": "Three Ways of DevOps",
    "name_zh": "DevOps三步法",
    "slug": "three-ways-devops",
    "category": "deployment",
    "desc": "Flow, Feedback, and Continual Learning as DevOps foundations",
    "desc_zh": "以流动、反馈、持续学习为核心的DevOps基础原则",
    "steps": [
      "First Way - Flow: optimize end-to-end delivery pipeline from dev to ops, minimizing batch size and WIP",
      "Map the value stream and eliminate handoffs, queues, and rework that slow throughput",
      "Second Way - Feedback: build fast feedback loops at every stage with automated testing and monitoring",
      "Create telemetry and alerting so problems are detected and fixed at the source quickly",
      "Third Way - Continual Learning: institutionalize blameless retrospectives, experimentation, and knowledge sharing"
    ],
    "steps_zh": [
      "第一步——流动：优化从开发到运维的端到端交付流水线，最小化批次大小和在制品数量",
      "绘制价值流图，消除拖慢吞吐量的交接、排队和返工环节",
      "第二步——反馈：在每个阶段通过自动化测试和监控构建快速反馈环路",
      "建立遥测和告警体系，在源头快速发现并修复问题",
      "第三步——持续学习：将无责任回顾、实验文化和知识共享制度化"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Flow",
      "Feedback",
      "Learning"
    ],
    "viz_labels_zh": [
      "流动",
      "反馈",
      "持续学习"
    ],
    "related": [
      "calms-framework",
      "dora-metrics",
      "continuous-architecture"
    ],
    "tags": [
      "devops",
      "flow",
      "feedback",
      "continual-learning"
    ],
    "origin_author": "Gene Kim, 2013",
    "origin_source": "The Phoenix Project (Gene Kim, Kevin Behr, George Spafford, 2013); The DevOps Handbook (Gene Kim et al., 2016)",
    "origin_source_zh": "《凤凰项目》（Gene Kim、Kevin Behr、George Spafford，2013）；《DevOps实践指南》（Gene Kim等，2016）",
    "complexity": "beginner",
    "when_to_use": [
      "Organizations seeking a foundational mental model for why DevOps practices work together",
      "Teams experiencing slow delivery pipelines, poor visibility into failures, or repeated incidents",
      "DevOps coaches and leaders who need a narrative framework to explain the philosophy behind specific practices",
      "Value stream mapping exercises where the goal is to identify which Way is weakest and needs investment"
    ],
    "when_to_use_zh": [
      "寻求一个基础心智模型来理解DevOps实践为何协同运作的组织",
      "正在经历交付流水线缓慢、故障可见性差或重复性事故的团队",
      "需要叙事框架来解释具体实践背后理念的DevOps教练和领导者",
      "旨在识别哪个步法最薄弱并需要投入的价值流图绘制活动"
    ],
    "core_concepts": [
      "First Way - Flow: Accelerate the left-to-right flow of work from Development to Operations by reducing batch sizes, eliminating waste, and making work visible",
      "Second Way - Feedback: Create right-to-left feedback loops at every stage so that problems are detected and corrected at their source before propagating downstream",
      "Third Way - Continual Learning: Foster a culture of experimentation, mastery, and knowledge sharing where failures become learning opportunities and improvements are systemic",
      "Systems Thinking: The Three Ways encourage optimizing the entire value stream rather than local silos, recognizing that local optimization often harms global throughput",
      "Theory of Constraints: Inspired by Goldratt's work, the Three Ways focus on identifying and elevating the constraint in the delivery pipeline"
    ],
    "core_concepts_zh": [
      "第一步——流动：通过缩小批次、消除浪费和使工作可视化，加速工作从开发到运维的从左到右流动",
      "第二步——反馈：在每个阶段创建从右到左的反馈环路，在问题向下游传播前在源头检测并纠正",
      "第三步——持续学习：培育实验、精进和知识共享的文化，将失败转化为学习机会，实现系统性改进",
      "系统思维：三步法鼓励优化整个价值流而非局部竖井，认识到局部优化往往损害全局吞吐量",
      "约束理论：受Goldratt工作的启发，三步法聚焦于识别并提升交付流水线中的约束点"
    ],
    "timeline": [
      [
        "2013",
        "Gene Kim publishes The Phoenix Project, introducing the Three Ways as a narrative framework through a fictional IT transformation story"
      ],
      [
        "2016",
        "The DevOps Handbook provides prescriptive guidance on implementing each of the Three Ways with specific practices and case studies"
      ],
      [
        "2018",
        "Accelerate by Forsgren, Humble, and Kim provides statistical evidence supporting the effectiveness of practices aligned with the Three Ways"
      ],
      [
        "2019",
        "Gene Kim publishes The Unicorn Project adding the Five Ideals as a complementary framework to the Three Ways"
      ],
      [
        "2021",
        "The Three Ways become a foundational element of DevOps certification curricula and enterprise transformation playbooks worldwide"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "Gene Kim出版《凤凰项目》，通过虚构IT转型故事以叙事框架方式介绍三步法"
      ],
      [
        "2016",
        "《DevOps实践指南》提供实施三步法的规定性指导，包含具体实践和案例研究"
      ],
      [
        "2018",
        "Forsgren、Humble和Kim的《Accelerate》为三步法相关实践的有效性提供统计证据"
      ],
      [
        "2019",
        "Gene Kim出版《独角兽项目》，新增五大理想作为三步法的补充框架"
      ],
      [
        "2021",
        "三步法成为全球DevOps认证课程和企业转型手册的基础要素"
      ]
    ],
    "dos": [
      "Do work on all three Ways simultaneously rather than sequentially, because Flow without Feedback creates fast but fragile pipelines",
      "Do use value stream mapping to visualize where work waits in queues, because most lead time is wait time, not work time",
      "Do invest in production telemetry and monitoring as a feedback mechanism, because you cannot improve what you cannot see",
      "Do celebrate learning from failures in blameless post-mortems, because fear of blame suppresses the Continual Learning loop"
    ],
    "dos_zh": [
      "务必同时推进三步法而非顺序执行，因为没有反馈的流动会造成快速但脆弱的流水线",
      "务必使用价值流图可视化工作在队列中等待的位置，因为大部分前置时间是等待时间而非工作时间",
      "务必投资生产遥测和监控作为反馈机制，因为看不见的东西无法改进",
      "务必在无责任事后复盘中庆祝从失败中学习，因为对责罚的恐惧会抑制持续学习环路"
    ],
    "donts": [
      "Don't focus only on the First Way (Flow/speed) and ignore quality feedback loops, because deploying fast without catching defects amplifies risk",
      "Don't treat the Three Ways as abstract philosophy without connecting them to concrete practices like CI/CD, monitoring, and blameless post-mortems",
      "Don't optimize individual team handoffs without looking at the end-to-end value stream, because local optimizations create upstream and downstream bottlenecks",
      "Don't skip the Third Way (Continual Learning) because it seems less urgent than technical practices -- culture is the foundation"
    ],
    "donts_zh": [
      "不要只关注第一步（流动/速度）而忽视质量反馈环路，因为快速部署但不捕获缺陷会放大风险",
      "不要将三步法作为抽象哲学而不与CI/CD、监控和无责任复盘等具体实践关联",
      "不要只优化单个团队的交接而不看端到端价值流，因为局部优化会造成上下游瓶颈",
      "不要因为第三步（持续学习）看似不如技术实践紧迫就跳过——文化才是根基"
    ],
    "case_study_company": "Nordstrom",
    "case_study": "Nordstrom's technology division applied the Three Ways during their 2015-2017 DevOps transformation. For Flow, they moved from biweekly releases to continuous deployment. For Feedback, they implemented full-stack observability with real-time dashboards visible to all teams. For Continual Learning, they established Game Day exercises simulating production failures. The result was a 3x improvement in deployment frequency and a 60% reduction in change failure rate.",
    "case_study_zh": "Nordstrom技术部门在2015-2017年的DevOps转型中应用了三步法。在流动方面，他们从双周发布转变为持续部署。在反馈方面，他们实施了全栈可观测性，并提供所有团队可见的实时仪表盘。在持续学习方面，他们建立了模拟生产故障的Game Day演练。成果是部署频率提升3倍，变更失败率降低60%。",
    "when_not_to_use": [
      "Teams looking for specific prescriptive tooling recommendations rather than a philosophical framework",
      "Organizations that need a maturity assessment model with scoring (use CALMS or DORA instead)",
      "Contexts where the manufacturing metaphors (Theory of Constraints, Lean) don't resonate with the audience",
      "Very small teams already practicing continuous delivery who need advanced patterns rather than foundational philosophy"
    ],
    "when_not_to_use_zh": [
      "寻求具体规定性工具推荐而非哲学框架的团队",
      "需要带评分的成熟度评估模型的组织（应使用CALMS或DORA替代）",
      "制造业隐喻（约束理论、精益）无法引起受众共鸣的场景",
      "已在实践持续交付、需要高级模式而非基础理念的极小团队"
    ],
    "adopters": [
      "Nordstrom",
      "Nike",
      "Capital One",
      "Target",
      "CSG International"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Gene Kim, Kevin Behr, and George Spafford (2013). \"The Phoenix Project: A Novel about IT, DevOps, and Helping Your Business Win\". IT Revolution Press.",
    "secondary_sources": [
      "Gene Kim, Jez Humble, Patrick Debois, and John Willis (2016). \"The DevOps Handbook\". IT Revolution Press.",
      "Eliyahu M. Goldratt (1984). \"The Goal: A Process of Ongoing Improvement\". North River Press."
    ],
    "typed_relations": [
      {
        "slug": "calms-framework",
        "type": "complement"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      },
      {
        "slug": "continuous-architecture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 66,
    "name": "Infrastructure as Code",
    "name_zh": "基础设施即代码",
    "slug": "infrastructure-as-code",
    "category": "deployment",
    "desc": "Manage and provision infrastructure through machine-readable config",
    "desc_zh": "通过机器可读配置文件管理和供应基础设施",
    "steps": [
      "Choose an IaC tool (Terraform, Pulumi, CloudFormation) matching your cloud/on-prem stack",
      "Write declarative or imperative configuration files defining all infrastructure resources",
      "Store IaC code in version control with the same branching and review process as application code",
      "Integrate IaC into CI/CD: run plan/diff on PRs and apply on merge to main",
      "Use modules and remote state to promote reuse and manage environment-specific configurations"
    ],
    "steps_zh": [
      "选择与云/本地环境匹配的IaC工具（Terraform、Pulumi、CloudFormation）",
      "编写声明式或命令式配置文件，定义所有基础设施资源",
      "将IaC代码存入版本控制系统，采用与应用代码相同的分支和审查流程",
      "将IaC集成到CI/CD流水线：在PR时执行plan/diff，合并主干时执行apply",
      "使用模块和远程状态（remote state）提升复用性，管理特定环境配置"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Code",
      "Plan",
      "Apply",
      "State"
    ],
    "viz_labels_zh": [
      "声明代码",
      "变更计划",
      "执行应用",
      "状态管理"
    ],
    "related": [
      "gitops",
      "three-ways-devops",
      "service-mesh-pattern"
    ],
    "tags": [
      "iac",
      "terraform",
      "infrastructure",
      "automation",
      "declarative"
    ],
    "origin_author": "Mark Burgess (CFEngine, 1993) / Luke Kanies (Puppet, 2005) / modern era: Mitchell Hashimoto (Terraform, 2014)",
    "origin_source": "Infrastructure as Code (Kief Morris, O'Reilly, 2016); Terraform: Up & Running (Yevgeniy Brikman, 2017)",
    "origin_source_zh": "《基础设施即代码》（Kief Morris，O'Reilly，2016）；《Terraform：启动与运行》（Yevgeniy Brikman，2017）",
    "complexity": "intermediate",
    "when_to_use": [
      "Any environment with more than a handful of servers or cloud resources that need consistent, repeatable provisioning",
      "Multi-environment setups (dev/staging/prod) where manual configuration drift causes deployment issues",
      "Teams requiring audit trails for every infrastructure change for compliance or regulatory requirements",
      "Disaster recovery scenarios where entire environments must be recreatable from code within predictable timeframes"
    ],
    "when_to_use_zh": [
      "任何需要一致、可重复供应的、拥有多台服务器或云资源的环境",
      "手动配置漂移导致部署问题的多环境设置（dev/staging/prod）",
      "出于合规或监管要求需要对每次基础设施变更保留审计轨迹的团队",
      "需要在可预测的时间框架内从代码重建整个环境的灾难恢复场景"
    ],
    "core_concepts": [
      "Declarative Configuration: Define the desired end-state of infrastructure (what), and let the tool determine how to achieve it, rather than scripting step-by-step procedures",
      "Idempotency: IaC operations can be applied multiple times and always produce the same result, making it safe to re-run after failures",
      "State Management: Tools like Terraform maintain a state file that maps declared resources to actual cloud resources, enabling drift detection",
      "Immutable Infrastructure: Instead of updating servers in place, destroy and recreate them from code, eliminating configuration drift",
      "Module Reuse: Encapsulate common infrastructure patterns into reusable modules shared across teams and environments"
    ],
    "core_concepts_zh": [
      "声明式配置：定义基础设施的期望终态（是什么），让工具决定如何实现，而非编写逐步执行的脚本",
      "幂等性：IaC操作可多次应用且始终产生相同结果，使故障后重新运行是安全的",
      "状态管理：Terraform等工具维护状态文件将声明的资源映射到实际云资源，实现偏差检测",
      "不可变基础设施：不在原地更新服务器，而是从代码销毁并重建，消除配置漂移",
      "模块复用：将常见基础设施模式封装为跨团队和环境共享的可复用模块"
    ],
    "timeline": [
      [
        "1993",
        "Mark Burgess creates CFEngine, the first configuration management tool treating infrastructure as declarative policy"
      ],
      [
        "2005",
        "Luke Kanies releases Puppet, making infrastructure-as-code practical for enterprise Linux/Unix environments"
      ],
      [
        "2012",
        "Ansible released, offering agentless IaC with YAML playbooks that lower the barrier to entry"
      ],
      [
        "2014",
        "Mitchell Hashimoto releases Terraform, introducing the HCL language and multi-cloud provider model"
      ],
      [
        "2020",
        "Pulumi, CDK, and Crossplane emerge, enabling IaC in general-purpose programming languages"
      ]
    ],
    "timeline_zh": [
      [
        "1993",
        "Mark Burgess创建CFEngine，首个将基础设施视为声明式策略的配置管理工具"
      ],
      [
        "2005",
        "Luke Kanies发布Puppet，使基础设施即代码在企业Linux/Unix环境中切实可行"
      ],
      [
        "2012",
        "Ansible发布，以无代理的YAML Playbook降低入门门槛"
      ],
      [
        "2014",
        "Mitchell Hashimoto发布Terraform，引入HCL语言和多云提供商模型"
      ],
      [
        "2020",
        "Pulumi、CDK和Crossplane出现，支持使用通用编程语言进行IaC"
      ]
    ],
    "dos": [
      "Do treat IaC code with the same rigor as application code: code review, automated testing, and CI/CD pipelines",
      "Do use remote state backends (S3, GCS, Terraform Cloud) with state locking to prevent concurrent apply conflicts",
      "Do modularize infrastructure code to avoid monolithic configurations that are hard to reason about",
      "Do implement policy-as-code (OPA, Sentinel) to enforce security and compliance guardrails before infrastructure changes are applied"
    ],
    "dos_zh": [
      "务必以与应用代码同等的严谨对待IaC代码：代码审查、自动化测试和CI/CD流水线",
      "务必使用带状态锁的远程状态后端（S3、GCS、Terraform Cloud）以防止并发应用冲突",
      "务必将基础设施代码模块化，避免难以理解的单体配置",
      "务必实施策略即代码（OPA、Sentinel）在基础设施变更应用前强制执行安全和合规护栏"
    ],
    "donts": [
      "Don't manually modify cloud resources outside of IaC, because this creates drift that the state file cannot track",
      "Don't store Terraform state files in local filesystems or Git, because concurrent access can corrupt the state and secrets stored in it can leak",
      "Don't write monolithic Terraform configurations with hundreds of resources, because plan times become unacceptably long",
      "Don't hardcode environment-specific values in modules, because this breaks reusability across environments"
    ],
    "donts_zh": [
      "不要在IaC之外手动修改云资源，因为这造成状态文件无法跟踪的漂移",
      "不要将Terraform状态文件存储在本地文件系统或Git中，因为并发访问会导致状态损坏，且其中的密钥可能泄露",
      "不要编写包含数百个资源的单体Terraform配置，因为计划时间变得不可接受",
      "不要在模块中硬编码环境特定值，因为这破坏了跨环境的可复用性"
    ],
    "case_study_company": "Segment",
    "case_study": "Segment (now part of Twilio) managed their entire AWS infrastructure across 130+ microservices using Terraform. When they migrated from a monorepo to a polyrepo Terraform structure in 2019, they created reusable modules for common patterns like ECS services, RDS databases, and VPCs. This enabled individual teams to spin up fully-compliant, production-ready infrastructure in minutes instead of days, reducing infrastructure provisioning time by 90%.",
    "case_study_zh": "Segment（现为Twilio旗下）使用Terraform管理其130多个微服务的全部AWS基础设施。当他们在2019年从单仓库迁移到多仓库Terraform结构时，为ECS服务、RDS数据库和VPC等常见模式创建了可复用模块。这使各团队能在数分钟而非数天内创建完全合规的生产就绪基础设施，将基础设施供应时间缩短90%。",
    "when_not_to_use": [
      "One-off experimental environments that will be deleted within hours and don't justify the overhead of writing IaC",
      "Legacy systems with infrastructure so complex and undocumented that the initial IaC migration cost is prohibitive",
      "Fully managed serverless architectures where the infrastructure layer is abstracted away by the platform",
      "Teams with zero version control or code review culture -- IaC without process rigor is worse than manual provisioning"
    ],
    "when_not_to_use_zh": [
      "数小时内就会删除的一次性实验环境——编写IaC的开销不合理",
      "基础设施过于复杂且缺乏文档、初始IaC迁移成本令人望而却步的遗留系统",
      "基础设施层已被平台完全抽象的纯无服务器架构",
      "完全没有版本控制或代码审查文化的团队——没有流程严谨性的IaC比手动供应更糟糕"
    ],
    "adopters": [
      "Segment (Twilio)",
      "HashiCorp",
      "Shopify",
      "Stripe",
      "Cloudflare"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Kief Morris (2016). \"Infrastructure as Code: Managing Servers in the Cloud\". O'Reilly Media.",
    "secondary_sources": [
      "Yevgeniy Brikman (2017). \"Terraform: Up & Running\". O'Reilly Media.",
      "Kief Morris (2020). \"Infrastructure as Code, 2nd Edition\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "three-ways-devops",
        "type": "complement"
      },
      {
        "slug": "service-mesh-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 67,
    "name": "Twelve-Factor App",
    "name_zh": "十二要素应用",
    "slug": "twelve-factor-app",
    "category": "deployment",
    "desc": "12 principles for building scalable, maintainable cloud services",
    "desc_zh": "构建可扩展、可维护云服务的12条原则",
    "steps": [
      "Audit the codebase: one codebase in version control, with all dependencies explicitly declared and isolated",
      "Externalize all configuration into environment variables; never commit secrets or environment-specific config to code",
      "Treat backing services (databases, queues, caches) as attached resources swappable via config",
      "Enforce strict separation of build, release, and run stages; build once, deploy the artifact to many environments",
      "Design for disposability and horizontal scale: stateless processes, fast startup, graceful shutdown, dev/prod parity"
    ],
    "steps_zh": [
      "审计代码库：单一代码库纳入版本控制，所有依赖显式声明并隔离",
      "将所有配置外部化为环境变量；不将密钥或环境特定配置提交到代码",
      "将后端服务（数据库、队列、缓存）视为可通过配置替换的附加资源",
      "严格分离构建、发布和运行阶段；一次构建，将制品部署到多个环境",
      "为可处置性和水平扩展而设计：无状态进程、快速启动、优雅关闭、开发与生产环境对等"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Build",
      "Release",
      "Run"
    ],
    "viz_labels_zh": [
      "构建",
      "发布",
      "运行"
    ],
    "related": [
      "infrastructure-as-code",
      "gitops",
      "calms-framework"
    ],
    "tags": [
      "twelve-factor",
      "cloud-native",
      "saas",
      "principles",
      "scalability"
    ],
    "origin_author": "Adam Wiggins / Heroku, 2011",
    "origin_source": "12factor.net manifesto (Adam Wiggins, 2011); influenced by Heroku's experience running hundreds of thousands of SaaS applications",
    "origin_source_zh": "12factor.net宣言（Adam Wiggins，2011）；受Heroku运行数十万SaaS应用经验的影响",
    "complexity": "intermediate",
    "when_to_use": [
      "Building new cloud-native applications or SaaS services that will run on PaaS or container platforms",
      "Modernizing legacy monoliths to prepare them for containerization and cloud deployment",
      "Evaluating application architecture readiness for horizontal scaling and multi-environment deployment",
      "Onboarding new developers to cloud-native best practices with a concise, memorable checklist"
    ],
    "when_to_use_zh": [
      "构建将在PaaS或容器平台上运行的新云原生应用或SaaS服务",
      "现代化遗留单体应用，为容器化和云部署做准备",
      "评估应用架构对水平扩展和多环境部署的就绪程度",
      "使用简洁易记的清单为新开发者提供云原生最佳实践入门指导"
    ],
    "core_concepts": [
      "Codebase: One codebase tracked in version control, with many deploys to different environments from the same artifact",
      "Config in Environment: Store configuration that varies between environments in environment variables, strictly separated from code",
      "Backing Services as Attached Resources: Treat databases, message queues, and caches as pluggable resources swappable without code changes",
      "Stateless Processes: Application processes should be stateless and share-nothing, storing persistent data in backing services",
      "Dev/Prod Parity: Keep development, staging, and production environments as similar as possible to reduce deployment surprises"
    ],
    "core_concepts_zh": [
      "代码库：一个代码库纳入版本控制，从同一制品多次部署到不同环境",
      "配置在环境中：将不同环境间变化的配置存储在环境变量中，与代码严格分离",
      "后端服务作为附加资源：将数据库、消息队列和缓存视为可插拔资源，无需代码变更即可替换",
      "无状态进程：应用进程应为无状态和无共享的，将持久数据存储在后端服务中",
      "开发/生产对等：保持开发、预发布和生产环境尽可能相似，以减少部署意外"
    ],
    "timeline": [
      [
        "2011",
        "Adam Wiggins publishes the Twelve-Factor App manifesto at 12factor.net based on Heroku's operational experience"
      ],
      [
        "2013",
        "Docker's rise makes several twelve-factor principles (dependency isolation, disposability) natural defaults"
      ],
      [
        "2015",
        "Kubernetes adoption accelerates twelve-factor practices: config via ConfigMaps/Secrets, stateless pods, horizontal pod autoscaling"
      ],
      [
        "2016",
        "Kevin Hoffman publishes Beyond the Twelve-Factor App adding three new factors for modern cloud-native development"
      ],
      [
        "2023",
        "Twelve-factor principles are deeply embedded in cloud-native frameworks and PaaS offerings as default patterns"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Adam Wiggins基于Heroku的运维经验在12factor.net发布十二要素应用宣言"
      ],
      [
        "2013",
        "Docker的兴起使多项十二要素原则（依赖隔离、可处置性）成为自然默认"
      ],
      [
        "2015",
        "Kubernetes的采用加速了十二要素实践：通过ConfigMaps/Secrets管理配置、无状态Pod、水平Pod自动扩展"
      ],
      [
        "2016",
        "Kevin Hoffman出版《超越十二要素应用》，为现代云原生开发新增三个要素"
      ],
      [
        "2023",
        "十二要素原则深度嵌入云原生框架和PaaS产品中，成为默认模式"
      ]
    ],
    "dos": [
      "Do explicitly declare and isolate all dependencies, because implicit dependencies cause deployment failures across environments",
      "Do use environment variables for all environment-specific configuration, because this enables the same build artifact to deploy anywhere",
      "Do design processes to be stateless and disposable with fast startup, because this enables horizontal scaling",
      "Do treat logs as event streams written to stdout, because centralized log aggregation works best without self-managed log files"
    ],
    "dos_zh": [
      "务必显式声明并隔离所有依赖，因为隐式依赖会导致跨环境部署失败",
      "务必对所有环境特定配置使用环境变量，因为这使同一构建制品能部署到任何地方",
      "务必将进程设计为无状态和可处置的并快速启动，因为这支持水平扩展",
      "务必将日志视为写入stdout的事件流，因为集中式日志聚合在应用不自行管理日志文件时效果最佳"
    ],
    "donts": [
      "Don't store state in the local filesystem or in memory between requests, because horizontal scaling will lose that data",
      "Don't embed environment-specific configuration in source code, because this prevents deploying the same artifact to multiple environments",
      "Don't create long-running background workers tightly coupled to the web process, because they should scale independently",
      "Don't rely on sticky sessions at the load balancer level, because this couples users to specific instances and prevents true statelessness"
    ],
    "donts_zh": [
      "不要在请求之间将状态存储在本地文件系统或内存中，因为水平扩展会丢失这些数据",
      "不要在源代码中嵌入环境特定配置，因为这阻止将同一制品部署到多个环境",
      "不要创建与Web进程紧密耦合的长期运行后台工作进程，因为它们应独立扩展",
      "不要依赖负载均衡器层面的粘性会话，因为这将用户绑定到特定实例并阻止真正的无状态化"
    ],
    "case_study_company": "Heroku",
    "case_study": "Heroku itself is the canonical case study, having developed the Twelve-Factor methodology from observing patterns across hundreds of thousands of applications deployed on their platform. Applications that adhered to twelve-factor principles consistently achieved higher uptime, faster scaling response, and easier debugging. The methodology became the intellectual foundation for an entire generation of PaaS and container platforms.",
    "case_study_zh": "Heroku本身就是经典案例，其十二要素方法论源于观察在其平台上部署的数十万应用的模式。遵循十二要素原则的应用始终实现更高的可用性、更快的扩展响应和更容易的调试。该方法论成为整整一代PaaS和容器平台的思想基础。",
    "when_not_to_use": [
      "Desktop or mobile applications that inherently require local state and filesystem access",
      "High-performance computing or GPU-bound workloads where statelessness conflicts with long-running computation",
      "Legacy enterprise applications with deep dependencies on specific OS features or middleware",
      "Embedded systems or IoT devices where environment variables and attached backing services are impractical"
    ],
    "when_not_to_use_zh": [
      "本质上需要本地状态和文件系统访问的桌面或移动应用",
      "无状态与长期运行计算冲突的高性能计算或GPU密集型工作负载",
      "深度依赖特定操作系统特性或中间件的遗留企业应用",
      "环境变量和附加后端服务不切实际的嵌入式系统或IoT设备"
    ],
    "adopters": [
      "Heroku (Salesforce)",
      "Netflix",
      "Spotify",
      "Shopify",
      "Airbnb"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "portability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Adam Wiggins (2011). \"The Twelve-Factor App\". 12factor.net.",
    "secondary_sources": [
      "Kevin Hoffman (2016). \"Beyond the Twelve-Factor App\". O'Reilly Media.",
      "Cornelia Davis (2019). \"Cloud Native Patterns\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "infrastructure-as-code",
        "type": "complement"
      },
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "calms-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 68,
    "name": "SLI/SLO/SLA",
    "name_zh": "SLI/SLO/SLA",
    "slug": "sli-slo-sla",
    "category": "deployment",
    "desc": "Define and measure service reliability through layered objectives",
    "desc_zh": "通过分层目标体系定义和度量服务可靠性",
    "steps": [
      "Define SLIs (Service Level Indicators): choose quantitative measures like availability, latency, throughput",
      "Set SLOs (Service Level Objectives): establish target thresholds for each SLI (e.g., 99.9% availability)",
      "Calculate Error Budget: the allowable failure margin (1 - SLO) available for risk-taking and innovation",
      "Monitor SLIs in real time with dashboards; burn rate alerts signal when budget depletes too fast",
      "Formalize SLAs (Service Level Agreements) with customers; tie consequences to SLO breaches"
    ],
    "steps_zh": [
      "定义SLI（服务水平指标）：选择可量化的度量指标，如可用性、延迟、吞吐量",
      "设定SLO（服务水平目标）：为每个SLI制定目标阈值（例如99.9%可用性）",
      "计算错误预算：可承受的失败裕量（1 - SLO），用于承担风险和推进创新",
      "通过仪表盘实时监控SLI，燃烧率告警在预算消耗过快时发出提示",
      "与客户签订正式SLA，明确违反SLO时的后果与处置机制"
    ],
    "ai_relevant": true,
    "viz_type": "pyramid",
    "viz_labels": [
      "SLI",
      "SLO",
      "SLA"
    ],
    "viz_labels_zh": [
      "服务指标",
      "服务目标",
      "服务协议"
    ],
    "related": [
      "four-golden-signals",
      "dora-metrics",
      "chaos-engineering"
    ],
    "tags": [
      "sli",
      "slo",
      "sla",
      "reliability",
      "error-budget"
    ],
    "origin_author": "Google SRE team, formalized by Ben Treynor Sloss, ~2003",
    "origin_source": "Site Reliability Engineering: How Google Runs Production Systems (Betsy Beyer et al., 2016)",
    "origin_source_zh": "《SRE：Google运维解密》（Betsy Beyer等，2016）",
    "complexity": "intermediate",
    "when_to_use": [
      "Services with external or internal customers who depend on measurable reliability guarantees",
      "Teams that need to balance shipping new features quickly and maintaining system stability",
      "Organizations establishing error budget policies to objectively decide when to freeze deployments",
      "Incident response processes that need clear thresholds for escalation and severity classification"
    ],
    "when_to_use_zh": [
      "拥有依赖可衡量可靠性保证的外部或内部客户的服务",
      "需要平衡快速发布新功能与维护系统稳定性之间张力的团队",
      "建立错误预算策略以客观决定何时冻结部署的组织",
      "需要清晰升级和严重性分级阈值的事故响应流程"
    ],
    "core_concepts": [
      "Service Level Indicator (SLI): A quantitative measure of service quality, such as request latency at the 99th percentile or availability as the ratio of successful requests",
      "Service Level Objective (SLO): A target value or range for an SLI that the service team commits to maintaining",
      "Error Budget: The complement of the SLO (1 - SLO), representing the allowable amount of unreliability; teams spend it on risky changes",
      "Burn Rate: The rate at which the error budget is being consumed; multi-window burn rate alerts detect both fast and slow exhaustion",
      "Service Level Agreement (SLA): A formal contract with customers that includes consequences when SLOs are breached, typically set looser than internal SLOs"
    ],
    "core_concepts_zh": [
      "服务水平指标（SLI）：服务质量的定量度量，如第99百分位请求延迟或成功请求比率",
      "服务水平目标（SLO）：服务团队承诺维护的SLI目标值或范围",
      "错误预算：SLO的反面（1 - SLO），代表允许的不可靠量；团队将其用于有风险的变更",
      "燃烧率：错误预算被消耗的速率；多窗口燃烧率告警检测快速和缓慢的预算耗尽",
      "服务水平协议（SLA）：与客户的正式合同，包含违反SLO时的后果，通常设置得比内部SLO更宽松"
    ],
    "timeline": [
      [
        "2003",
        "Google's SRE team under Ben Treynor Sloss formalizes SLI/SLO/error budget practices for internal services"
      ],
      [
        "2016",
        "Site Reliability Engineering book published, making Google's SLI/SLO/SLA framework publicly accessible"
      ],
      [
        "2018",
        "The Site Reliability Workbook provides practical implementation guidance for SLOs with worked examples"
      ],
      [
        "2020",
        "OpenSLO specification launched to standardize SLO definitions as code across monitoring platforms"
      ],
      [
        "2023",
        "SLO-based alerting (burn rate alerts) becomes the recommended approach in major observability platforms"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Google SRE团队在Ben Treynor Sloss领导下为内部服务正式化SLI/SLO/错误预算实践"
      ],
      [
        "2016",
        "《SRE：Google运维解密》出版，使SLI/SLO/SLA框架对外公开"
      ],
      [
        "2018",
        "《SRE工作手册》提供SLO的实践实施指导和实例"
      ],
      [
        "2020",
        "OpenSLO规范发布，标准化跨监控平台的SLO代码化定义"
      ],
      [
        "2023",
        "基于SLO的告警（燃烧率告警）成为主要可观测性平台的推荐方法"
      ]
    ],
    "dos": [
      "Do choose SLIs that reflect what users actually care about (request success rate, page load time) rather than internal system metrics",
      "Do set SLOs based on user expectations and business impact analysis, not arbitrary five-nines targets",
      "Do implement error budget policies that clearly define what happens when the budget is exhausted",
      "Do use multi-window, multi-burn-rate alerting to catch both sudden outages and gradual degradation"
    ],
    "dos_zh": [
      "务必选择反映用户真正关心的SLI（请求成功率、页面加载时间），而非内部系统指标",
      "务必基于用户期望和业务影响分析设定SLO，而非任意的五个9目标",
      "务必实施错误预算策略，明确定义预算耗尽时的措施",
      "务必使用多窗口、多燃烧率告警来捕获突发宕机和渐进劣化"
    ],
    "donts": [
      "Don't set SLOs at 100%, because it is mathematically impossible to achieve and leaves zero room for innovation",
      "Don't confuse SLOs with SLAs -- SLOs are internal engineering targets while SLAs are customer-facing contracts",
      "Don't create SLOs for every metric imaginable -- pick 3-5 that truly represent user happiness",
      "Don't ignore error budget consumption trends, because running hot signals systemic reliability issues"
    ],
    "donts_zh": [
      "不要将SLO设为100%，因为数学上不可能实现且不留任何创新空间",
      "不要混淆SLO和SLA——SLO是内部工程目标，SLA是面向客户的合同",
      "不要为每个指标创建SLO——选择3-5个真正代表用户满意度的",
      "不要忽视错误预算消耗趋势，因为持续接近上限意味着系统性可靠性问题"
    ],
    "case_study_company": "Google",
    "case_study": "Google pioneered the SLI/SLO/error budget model internally for services like Gmail, Search, and Cloud Platform. For Google Cloud, they publish external SLAs backed by financial credits. Internally, when a service exhausts its error budget, the team must shift focus from feature development to reliability improvements. This mechanism enables Google to maintain high reliability across thousands of services while still shipping features at high velocity.",
    "case_study_zh": "Google在Gmail、搜索和Cloud Platform等服务中率先实践SLI/SLO/错误预算模型。对于Google Cloud，他们发布由财务积分支持的外部SLA。在内部，当服务耗尽错误预算时，团队必须将重心从功能开发转向可靠性改进。这一机制使Google在数千个服务上保持高可靠性的同时仍能高速发布功能。",
    "when_not_to_use": [
      "Internal tools with very few users where formal SLOs add overhead without meaningful reliability improvement",
      "Batch processing systems where request-based SLIs don't apply -- use job success rate metrics instead",
      "Very early-stage products where usage patterns are unknown and setting meaningful SLOs is premature",
      "Systems where 100% correctness is non-negotiable (financial transactions, medical devices) and error budgets don't apply"
    ],
    "when_not_to_use_zh": [
      "用户极少的内部工具——正式SLO增加开销但无有意义的可靠性提升",
      "基于请求的SLI不适用的批处理系统——应使用作业成功率指标",
      "使用模式未知且设定SLO为时过早的极早期产品",
      "100%正确性不可协商且错误预算不适用的系统（金融交易、医疗设备）"
    ],
    "adopters": [
      "Google",
      "Datadog",
      "Uber",
      "Slack",
      "New Relic"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "observability"
    ],
    "maturity_ring": "established",
    "primary_source": "Betsy Beyer, Chris Jones, Jennifer Petoff, and Niall Richard Murphy (2016). \"Site Reliability Engineering: How Google Runs Production Systems\". O'Reilly Media. Chapters 4-5.",
    "secondary_sources": [
      "Betsy Beyer et al. (2018). \"The Site Reliability Workbook: Practical Ways to Implement SRE\". O'Reilly Media.",
      "Alex Hidalgo (2020). \"Implementing Service Level Objectives\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "four-golden-signals",
        "type": "complement"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 70,
    "name": "MLOps",
    "name_zh": "MLOps",
    "slug": "mlops",
    "category": "deployment",
    "desc": "Apply DevOps practices to ML model lifecycle in production",
    "desc_zh": "将DevOps实践应用于机器学习模型的生产全生命周期",
    "steps": [
      "Version data, code, and model artifacts together; use a feature store for reproducibility",
      "Automate training pipelines triggered by new data or code changes with experiment tracking",
      "Validate model quality with offline metrics (AUC, RMSE) and fairness/bias checks before promotion",
      "Deploy models via A/B testing or shadow mode; monitor data drift and prediction distribution",
      "Trigger automated retraining pipelines when drift or performance degradation is detected"
    ],
    "steps_zh": [
      "对数据、代码和模型制品进行联合版本管理，使用特征存储保障可复现性",
      "以新数据或代码变更为触发器自动化训练流水线，并跟踪实验记录",
      "在模型晋升前通过离线指标（AUC、RMSE）和公平性/偏差检查验证质量",
      "通过A/B测试或影子模式部署模型，监控数据漂移和预测分布变化",
      "检测到漂移或性能劣化时触发自动化重训练流水线"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Develop",
      "Train",
      "Deploy",
      "Monitor"
    ],
    "viz_labels_zh": [
      "开发",
      "训练",
      "部署",
      "监控"
    ],
    "related": [
      "llmops",
      "dora-metrics",
      "canary-deployment"
    ],
    "tags": [
      "mlops",
      "machine-learning",
      "model-lifecycle",
      "training-pipeline"
    ],
    "origin_author": "D. Sculley et al. (Google), 2015; term MLOps coined ~2018",
    "origin_source": "Hidden Technical Debt in Machine Learning Systems (D. Sculley et al., NeurIPS 2015); Google Cloud MLOps guide (2020)",
    "origin_source_zh": "《机器学习系统中的隐性技术债》（D. Sculley等，NeurIPS 2015）；Google Cloud MLOps指南（2020）",
    "complexity": "advanced",
    "when_to_use": [
      "Organizations deploying ML models to production that need reproducibility, monitoring, and automated retraining",
      "Teams managing multiple models across different lifecycle stages",
      "Regulated industries requiring model governance, audit trails, and bias validation",
      "Data science teams transitioning from notebook-based experimentation to production-grade model serving"
    ],
    "when_to_use_zh": [
      "将ML模型部署到生产并需要可复现性、监控和自动化重训练的组织",
      "管理处于不同生命周期阶段的多个模型的团队",
      "需要模型治理、审计轨迹和偏差验证的受监管行业",
      "从Notebook实验过渡到生产级模型服务的数据科学团队"
    ],
    "core_concepts": [
      "Data Versioning: Track and version datasets alongside code and model artifacts to ensure reproducibility",
      "Feature Store: A centralized repository of reusable features ensuring consistency between training and serving",
      "Experiment Tracking: Log hyperparameters, metrics, and artifacts for every training run for reproducible comparisons",
      "Model Registry: A versioned catalog of trained models with metadata governing promotion from staging to production",
      "Data Drift Detection: Continuous monitoring of input data distributions to detect when production data diverges from training data"
    ],
    "core_concepts_zh": [
      "数据版本化：将数据集与代码和模型制品一起跟踪和版本化，确保可复现性",
      "特征存储：集中管理的可复用特征仓库，确保训练和服务之间的一致性",
      "实验追踪：记录每次训练运行的超参数、指标和制品，支持可复现的比较",
      "模型注册中心：带元数据的训练模型版本化目录，管理从预发布到生产的晋升",
      "数据漂移检测：持续监控输入数据分布，检测生产数据何时偏离训练数据"
    ],
    "timeline": [
      [
        "2015",
        "Google publishes Hidden Technical Debt in Machine Learning Systems, highlighting ML operational challenges"
      ],
      [
        "2018",
        "The term MLOps gains traction; MLflow released by Databricks"
      ],
      [
        "2020",
        "Google Cloud publishes MLOps maturity model (Levels 0-2) defining automation stages"
      ],
      [
        "2021",
        "Feature stores (Feast, Tecton) and model registries mature as essential MLOps components"
      ],
      [
        "2023",
        "MLOps platforms consolidate (Weights & Biases, MLflow, Vertex AI, SageMaker) with integrated capabilities"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Google发表《机器学习系统中的隐性技术债》，揭示ML运维挑战"
      ],
      [
        "2018",
        "MLOps一词获得广泛关注；Databricks发布MLflow"
      ],
      [
        "2020",
        "Google Cloud发布MLOps成熟度模型（0-2级）"
      ],
      [
        "2021",
        "特征存储（Feast、Tecton）和模型注册中心日趋成熟"
      ],
      [
        "2023",
        "MLOps平台整合（Weights & Biases、MLflow、Vertex AI、SageMaker）"
      ]
    ],
    "dos": [
      "Do version data, code, and model artifacts as a unified lineage, because reproducibility requires knowing exactly which data trained which model",
      "Do automate model validation gates with offline metrics and bias checks before any model reaches production, because manual review does not scale",
      "Do monitor production model performance continuously, because data drift causes silent accuracy degradation",
      "Do implement automated retraining pipelines triggered by drift detection, because model freshness directly impacts prediction quality"
    ],
    "dos_zh": [
      "务必将数据、代码和模型制品作为统一血缘进行版本化，因为可复现性需要知道哪些数据训练了哪个模型",
      "务必在模型到达生产前通过离线指标和偏差检查自动化模型验证门控，因为手动审查无法规模化",
      "务必持续监控生产模型性能，因为数据漂移会导致无声的准确率劣化",
      "务必实施由漂移检测触发的自动化重训练流水线，因为模型新鲜度直接影响预测质量"
    ],
    "donts": [
      "Don't deploy models without a rollback strategy, because ML models can degrade subtly",
      "Don't train on production data without proper data governance, because PII leakage has severe legal consequences",
      "Don't ignore training-serving skew, because feature computation differences cause silent prediction errors",
      "Don't let data scientists deploy directly from notebooks to production, because notebooks lack testing and monitoring"
    ],
    "donts_zh": [
      "不要在没有回滚策略的情况下部署模型，因为ML模型可能微妙劣化",
      "不要在没有数据治理的情况下使用生产数据训练，因为PII泄露有严重法律后果",
      "不要忽视训练-服务偏差，因为特征计算差异会导致无声的预测错误",
      "不要让数据科学家直接从Notebook部署到生产，因为Notebook缺乏测试和监控"
    ],
    "case_study_company": "Uber",
    "case_study": "Uber built Michelangelo, their internal MLOps platform, to manage the full lifecycle of thousands of ML models powering ride pricing, ETA predictions, fraud detection, and driver matching. Michelangelo provides a feature store, automated training pipelines, a model registry, and real-time serving infrastructure. By standardizing the ML workflow, Uber reduced model development-to-production time from months to days.",
    "case_study_zh": "Uber构建了内部MLOps平台Michelangelo，管理驱动乘车定价、ETA预测、欺诈检测和司机匹配的数千个ML模型的全生命周期。Michelangelo提供特征存储、自动化训练流水线、模型注册中心和实时服务基础设施。通过标准化ML工作流，Uber将模型从开发到生产的时间从数月缩短到数天。",
    "when_not_to_use": [
      "One-off data analysis or research projects where models are not deployed to production",
      "Simple rule-based systems where ML models are not involved",
      "Very early ML exploration where the team is still evaluating whether ML adds value",
      "Tiny teams with a single model where MLOps tooling overhead exceeds the benefit"
    ],
    "when_not_to_use_zh": [
      "模型不部署到生产的一次性数据分析或研究项目",
      "不涉及ML模型的简单规则系统",
      "团队仍在评估ML是否有价值的极早期探索",
      "MLOps工具开销超过收益的单模型小团队"
    ],
    "adopters": [
      "Uber",
      "Airbnb",
      "Spotify",
      "Netflix",
      "Lyft"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "D. Sculley et al. (2015). \"Hidden Technical Debt in Machine Learning Systems\". NeurIPS 2015.",
    "secondary_sources": [
      "Google Cloud (2020). \"MLOps: Continuous Delivery and Automation Pipelines in Machine Learning\". cloud.google.com.",
      "Noah Gift (2021). \"Practical MLOps: Operationalizing Machine Learning Models\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "llmops",
        "type": "extends"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      },
      {
        "slug": "canary-deployment",
        "type": "complement"
      }
    ]
  },
  {
    "id": 71,
    "name": "LLMOps",
    "name_zh": "LLMOps",
    "slug": "llmops",
    "category": "deployment",
    "desc": "Operationalize LLM-based apps with prompt, eval, and cost management",
    "desc_zh": "通过提示、评测与成本管理将大语言模型应用投入生产运营",
    "steps": [
      "Version and manage prompts as first-class artifacts in a prompt registry with change history",
      "Build automated LLM evaluation pipelines using reference datasets and judge-model scoring",
      "Implement observability: trace every LLM call with inputs, outputs, latency, and token costs",
      "Gate deployments with eval thresholds; use canary promotion for new model versions or prompts",
      "Monitor production for hallucination rates, latency SLOs, cost per request, and safety violations"
    ],
    "steps_zh": [
      "将提示词作为一等制品纳入提示词注册中心进行版本管理，保留变更历史",
      "使用参考数据集和裁判模型评分，构建自动化的LLM评测流水线",
      "实现可观测性：对每次LLM调用记录输入、输出、延迟和Token成本",
      "以评测阈值作为部署门控，对新模型版本或提示词采用金丝雀晋升策略",
      "在生产环境监控幻觉率、延迟SLO、单次请求成本及安全违规情况"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Prompt",
      "Evaluate",
      "Fine-tune",
      "Deploy"
    ],
    "viz_labels_zh": [
      "提示工程",
      "评估",
      "微调",
      "部署"
    ],
    "related": [
      "mlops",
      "prompt-testing",
      "ai-observability-framework"
    ],
    "tags": [
      "llmops",
      "prompt-management",
      "evaluation",
      "cost-management",
      "ai"
    ],
    "origin_author": "Emerging from AI/ML ops community, ~2023; influenced by Chip Huyen, Hamel Husain, and teams at Anthropic, OpenAI, and Google",
    "origin_source": "Chip Huyen's Building LLM Applications for Production (2023); Anthropic and OpenAI operational guidelines",
    "origin_source_zh": "Chip Huyen的《为生产构建LLM应用》（2023）；Anthropic和OpenAI运维指南",
    "complexity": "advanced",
    "when_to_use": [
      "Production LLM applications where prompt changes are as impactful as code changes",
      "Applications where LLM output quality must be continuously measured against regression benchmarks",
      "Cost-sensitive deployments where token usage, model selection, and caching must be optimized",
      "Safety-critical applications requiring continuous monitoring for hallucinations and policy violations"
    ],
    "when_to_use_zh": [
      "提示词变更与代码变更同等重要的生产LLM应用",
      "LLM输出质量须持续对照回归基准进行度量的应用",
      "需要优化Token使用、模型选择和缓存的成本敏感型部署",
      "需要持续监控幻觉和策略违规的安全关键型应用"
    ],
    "core_concepts": [
      "Prompt Versioning: Treat prompts as first-class versioned artifacts with change history, A/B testing, and rollback capabilities",
      "LLM Evaluation Pipelines: Automated testing using reference datasets, judge models, and human-in-the-loop scoring to validate output quality",
      "Token Economics: Track and optimize token consumption per request, implement semantic caching, and select the smallest adequate model",
      "Guardrails and Safety: Input/output validation layers that detect and block hallucinations, PII leakage, and jailbreak attempts in real-time",
      "LLM Observability: End-to-end tracing of LLM calls including prompt templates, retrieved context, model responses, latency, and cost"
    ],
    "core_concepts_zh": [
      "提示词版本化：将提示词作为一等版本化制品，具备变更历史、A/B测试和回滚能力",
      "LLM评测流水线：使用参考数据集、裁判模型和人工评分的自动化测试，在部署前验证输出质量",
      "Token经济学：跟踪和优化每次请求的Token消耗，实施语义缓存，选择最小够用模型",
      "护栏与安全：实时检测和阻止幻觉、PII泄露和越狱尝试的输入/输出验证层",
      "LLM可观测性：LLM调用的端到端追踪，包括提示词模板、检索上下文、模型响应、延迟和成本"
    ],
    "timeline": [
      [
        "2022",
        "ChatGPT launch triggers massive adoption of LLMs in production, revealing need for operational practices beyond traditional MLOps"
      ],
      [
        "2023",
        "LLMOps emerges as a distinct discipline; LangSmith, Humanloop, and Braintrust launch as dedicated platforms"
      ],
      [
        "2024",
        "Prompt engineering matures with structured approaches; OpenAI, Anthropic, and Google publish production deployment guides"
      ],
      [
        "2025",
        "LLMOps tooling consolidates around evaluation-driven development, with CI/CD pipelines gating on eval scores"
      ],
      [
        "2026",
        "Agent-specific LLMOps patterns emerge, addressing multi-step reasoning traces, tool call auditing, and agentic cost management"
      ]
    ],
    "timeline_zh": [
      [
        "2022",
        "ChatGPT发布引发LLM在生产中的大规模采用，揭示超越传统MLOps的运维实践需求"
      ],
      [
        "2023",
        "LLMOps作为独立学科出现；LangSmith、Humanloop和Braintrust作为专用平台推出"
      ],
      [
        "2024",
        "提示工程以结构化方法走向成熟；OpenAI、Anthropic和Google发布生产部署指南"
      ],
      [
        "2025",
        "LLMOps工具围绕评测驱动开发整合，CI/CD流水线以评测分数作为部署门控"
      ],
      [
        "2026",
        "Agent专用LLMOps模式出现，处理多步推理追踪、工具调用审计和Agent成本管理"
      ]
    ],
    "dos": [
      "Do version prompts in a registry with full change history, because a prompt change can alter application behavior dramatically",
      "Do build automated eval suites that run on every prompt or model change, because manual spot-checking misses subtle regressions",
      "Do implement semantic caching for common queries to reduce latency and token costs",
      "Do monitor token costs per endpoint and per user segment, because LLM inference costs can scale non-linearly"
    ],
    "dos_zh": [
      "务必在注册中心对提示词进行版本管理并保留完整变更历史，因为提示词变更可能剧烈改变应用行为",
      "务必构建在每次提示词或模型变更时运行的自动化评测套件，因为手动抽查无法捕获微妙回归",
      "务必为常见查询实施语义缓存以降低延迟和Token成本",
      "务必按端点和用户分组监控Token成本，因为LLM推理成本可能非线性增长"
    ],
    "donts": [
      "Don't treat prompts as just strings that anyone can edit without review, because unreviewed prompt changes are the top cause of LLM incidents",
      "Don't deploy a new model version without running your eval suite, because even minor version bumps can cause significant output shifts",
      "Don't rely solely on automated metrics for LLM quality -- incorporate human evaluation for nuanced aspects",
      "Don't ignore the latency impact of safety guardrails, because synchronous validation adds to LLM response times"
    ],
    "donts_zh": [
      "不要将提示词视为任何人可以不经审查就编辑的普通字符串，因为未审查的提示词变更是LLM应用事故的头号原因",
      "不要在未运行评测套件的情况下部署新模型版本，因为即使微小版本更新也可能导致显著输出偏移",
      "不要仅依赖自动化指标衡量LLM质量——应纳入人工评估覆盖细微方面",
      "不要忽视安全护栏的延迟影响，因为同步验证会增加LLM响应时间"
    ],
    "case_study_company": "Notion",
    "case_study": "Notion built a comprehensive LLMOps pipeline for their Notion AI product, powering AI writing assistance, summarization, and Q&A across millions of workspaces. They implemented prompt versioning with A/B testing, automated evaluation pipelines using model-based judges and human raters, and real-time monitoring of hallucination rates. Their LLMOps practices enabled weekly prompt iterations without quality regressions, while intelligent model routing kept per-request costs within budget.",
    "case_study_zh": "Notion为其Notion AI产品构建了全面的LLMOps流水线，为数百万工作空间提供AI写作辅助、摘要和问答。他们实施了带A/B测试的提示词版本管理、使用模型评判器和人工评分员的自动化评测流水线，以及幻觉率的实时监控。LLMOps实践使团队能够每周迭代提示词而不产生质量回归，同时智能模型路由将单次请求成本控制在预算内。",
    "when_not_to_use": [
      "Simple LLM integrations where eval pipelines and prompt registries overhead is unjustified",
      "Research and experimentation phases where the application is not yet serving real users",
      "Applications using pre-built LLM APIs with no custom prompts to version or evaluate",
      "Batch-only LLM usage where real-time monitoring and latency SLOs are irrelevant"
    ],
    "when_not_to_use_zh": [
      "评测流水线和提示词注册中心开销不合理的简单LLM集成",
      "应用尚未服务真实用户的研究和实验阶段",
      "使用无自定义提示词的预构建LLM API",
      "仅批量使用LLM、实时监控和延迟SLO无关紧要的场景"
    ],
    "adopters": [
      "Notion",
      "Anthropic",
      "OpenAI",
      "Shopify",
      "Stripe"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Chip Huyen (2023). \"Building LLM Applications for Production\". huyenchip.com.",
    "secondary_sources": [
      "Anthropic (2024). \"Building Effective Agents\". anthropic.com.",
      "OpenAI (2023). \"Production Best Practices\". platform.openai.com."
    ],
    "typed_relations": [
      {
        "slug": "mlops",
        "type": "extends"
      },
      {
        "slug": "prompt-testing",
        "type": "complement"
      },
      {
        "slug": "ai-observability-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 72,
    "name": "Agent Deployment Patterns",
    "name_zh": "Agent部署模式",
    "slug": "agent-deployment-patterns",
    "category": "deployment",
    "desc": "Patterns for reliably deploying autonomous AI agents in production",
    "desc_zh": "在生产环境中可靠部署自主AI Agent的架构模式",
    "steps": [
      "Define the agent's execution boundary: tools available, max steps, memory scope, and human-in-the-loop checkpoints",
      "Containerize the agent runtime with pinned LLM versions and tool dependencies for reproducibility",
      "Implement guardrails: input validation, output filtering, cost caps, and rate limiting per agent instance",
      "Deploy with observability: trace multi-step reasoning chains, tool calls, retries, and final outputs end-to-end",
      "Use staged rollout (shadow -> canary -> full) and maintain kill-switch capability to halt runaway agents"
    ],
    "steps_zh": [
      "定义Agent的执行边界：可用工具、最大步骤数、记忆范围及人工介入检查点",
      "将Agent运行时容器化，固定LLM版本和工具依赖以保障可复现性",
      "实施护栏机制：输入验证、输出过滤、成本上限及每个Agent实例的速率限制",
      "部署可观测性能力：端到端追踪多步推理链、工具调用、重试和最终输出",
      "采用分阶段发布（影子→金丝雀→全量），并保留可立即停止失控Agent的熔断开关"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Orchestrate",
      "Invoke",
      "Observe",
      "Scale"
    ],
    "viz_labels_zh": [
      "编排",
      "调用",
      "观测",
      "扩缩容"
    ],
    "related": [
      "agent-reliability-patterns",
      "llmops",
      "canary-deployment"
    ],
    "tags": [
      "agent-deployment",
      "guardrails",
      "staged-rollout",
      "kill-switch",
      "ai"
    ],
    "origin_author": "Emerging from AI engineering community, ~2024; influenced by practices at Anthropic, OpenAI, Google DeepMind, and LangChain",
    "origin_source": "Anthropic's Building effective agents guide (2024); OpenAI production deployment documentation; LangChain/LangGraph agent architecture patterns",
    "origin_source_zh": "Anthropic《构建有效Agent》指南（2024）；OpenAI生产部署文档；LangChain/LangGraph Agent架构模式",
    "complexity": "advanced",
    "when_to_use": [
      "Deploying autonomous AI agents that make multi-step decisions with real-world tool access",
      "Production systems where agent failures can have financial, security, or reputational consequences",
      "Organizations needing to audit and trace every decision an agent makes for compliance",
      "Teams scaling from prototype agents to production-grade systems serving real users with SLOs"
    ],
    "when_to_use_zh": [
      "部署具有真实世界工具访问权限的自主AI Agent进行多步决策",
      "Agent故障可能产生财务、安全或声誉后果的生产系统",
      "需要审计和追踪Agent每个决策以满足合规需求的组织",
      "从原型Agent扩展到服务真实用户并带有SLO的生产级系统的团队"
    ],
    "core_concepts": [
      "Execution Boundary: Define clear limits on what an agent can do -- tools, max steps, memory scope, and cost ceiling per invocation",
      "Human-in-the-Loop Checkpoints: Critical or irreversible actions require human approval before the agent proceeds",
      "Agent Observability: End-to-end tracing of the full reasoning chain -- each thought, tool call, tool response, retry, and final answer",
      "Kill Switch: The ability to instantly halt a running agent across all instances when anomalous behavior is detected",
      "Shadow Mode: Deploy new agent versions in parallel with production agents, comparing decisions without serving real users"
    ],
    "core_concepts_zh": [
      "执行边界：明确定义Agent的能力范围——工具、最大步骤数、记忆范围和每次调用的成本上限",
      "人工介入检查点：关键或不可逆操作需要人工批准后Agent才能继续",
      "Agent可观测性：完整推理链的端到端追踪——每个思考、工具调用、工具响应、重试和最终答案",
      "熔断开关：当检测到异常行为时，能立即停止所有实例上运行的Agent",
      "影子模式：将新Agent版本与生产Agent并行部署，比较决策但不服务真实用户"
    ],
    "timeline": [
      [
        "2023",
        "LangChain and AutoGPT popularize autonomous agent architectures, revealing production deployment challenges"
      ],
      [
        "2024",
        "Anthropic publishes Building effective agents guide; OpenAI releases Assistants API with built-in tool use"
      ],
      [
        "2024",
        "Agent observability platforms emerge (LangSmith, Arize Phoenix) with multi-step trace visualization"
      ],
      [
        "2025",
        "Agent deployment patterns formalize around execution boundaries, kill switches, and staged rollouts"
      ],
      [
        "2026",
        "Multi-agent orchestration deployment patterns mature, addressing inter-agent communication and cost management"
      ]
    ],
    "timeline_zh": [
      [
        "2023",
        "LangChain和AutoGPT推广自主Agent架构，揭示生产部署挑战"
      ],
      [
        "2024",
        "Anthropic发布《构建有效Agent》指南；OpenAI发布Assistants API"
      ],
      [
        "2024",
        "Agent可观测性平台出现（LangSmith、Arize Phoenix），提供多步追踪可视化"
      ],
      [
        "2025",
        "Agent部署模式围绕执行边界、熔断开关和分阶段发布正式化"
      ],
      [
        "2026",
        "多Agent编排部署模式日趋成熟，处理Agent间通信和成本管理"
      ]
    ],
    "dos": [
      "Do set hard cost and step limits per agent invocation, because LLM-powered agents can enter infinite reasoning loops",
      "Do implement human-in-the-loop approval for irreversible actions, because autonomous agents will eventually make mistakes",
      "Do trace the full reasoning chain for every agent run, because debugging agent failures without traces is nearly impossible",
      "Do deploy new agent versions in shadow mode first to compare decisions against the current production version"
    ],
    "dos_zh": [
      "务必为每次Agent调用设置硬性成本和步骤限制，因为LLM驱动的Agent可能进入无限推理循环",
      "务必为不可逆操作实施人工介入审批，因为自主Agent终将犯错",
      "务必对每次Agent运行追踪完整推理链，因为没有追踪几乎不可能调试Agent故障",
      "务必先以影子模式部署新Agent版本，将其决策与生产版本比较"
    ],
    "donts": [
      "Don't give agents unrestricted tool access in production, because prompt injection with broad permissions can cause catastrophic damage",
      "Don't deploy agents without a kill switch, because runaway agents can accumulate massive costs within minutes",
      "Don't skip evaluation on edge cases and adversarial inputs, because agents are more unpredictable than traditional software",
      "Don't treat agent deployments like stateless service deployments, because agents carry reasoning state across steps"
    ],
    "donts_zh": [
      "不要在生产中给Agent不受限的工具访问权限，因为提示注入在宽泛权限下可能造成灾难性损害",
      "不要在没有熔断开关的情况下部署Agent，因为失控Agent可能在几分钟内累积巨额成本",
      "不要跳过对边缘情况和对抗性输入的评测，因为Agent比传统软件更不可预测",
      "不要将Agent部署等同于无状态服务部署，因为Agent在步骤间携带推理状态"
    ],
    "case_study_company": "Klarna",
    "case_study": "Klarna deployed an AI customer service agent powered by OpenAI that handles the equivalent of 700 full-time human agents, resolving two-thirds of customer support conversations in its first month of operation in 2024. They implemented strict execution boundaries, human escalation paths for complex cases, and comprehensive tracing of every conversation. The deployment followed a staged rollout: shadow mode for two months, then 5% of conversations, then gradual scaling to full production, with kill-switch capability active throughout.",
    "case_study_zh": "Klarna部署了由OpenAI驱动的AI客服Agent，在2024年投入运营的第一个月内处理了相当于700名全职人工客服的工作量，解决了三分之二的客户支持对话。他们实施了严格的执行边界、复杂案例的人工升级路径，以及对每次对话的全面追踪。部署遵循分阶段发布：两个月影子模式，然后5%的对话，再逐步扩展到全量生产，全程保持熔断开关激活。",
    "when_not_to_use": [
      "Simple single-turn LLM interactions that don't involve multi-step reasoning or tool use",
      "Environments with no tolerance for non-deterministic behavior",
      "Internal prototypes or demos where production reliability and safety are not concerns",
      "Use cases where a traditional deterministic workflow engine is sufficient"
    ],
    "when_not_to_use_zh": [
      "不涉及多步推理或工具使用的简单单轮LLM交互",
      "不容忍非确定性行为的环境",
      "不关注生产可靠性和安全性的内部原型或演示",
      "传统确定性工作流引擎已足够的场景"
    ],
    "adopters": [
      "Klarna",
      "Anthropic",
      "OpenAI",
      "Replit",
      "Cognition (Devin)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "security"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Anthropic (2024). \"Building Effective Agents\". anthropic.com.",
    "secondary_sources": [
      "OpenAI (2023). \"Production Deployment Best Practices\". platform.openai.com.",
      "Harrison Chase et al. (2023). \"LangGraph: Multi-Actor Agent Framework\". langchain.com."
    ],
    "typed_relations": [
      {
        "slug": "agent-reliability-patterns",
        "type": "complement"
      },
      {
        "slug": "llmops",
        "type": "complement"
      },
      {
        "slug": "canary-deployment",
        "type": "complement"
      }
    ]
  },
  {
    "id": 183,
    "name": "Progressive Delivery",
    "name_zh": "渐进式交付",
    "slug": "progressive-delivery",
    "category": "deployment",
    "desc": "Combine canary, feature flags, and observability for controlled rollouts",
    "desc_zh": "结合金丝雀发布、特性开关和可观测性实现受控发布",
    "steps": [
      "Define the release audience using cohort selectors (percentage, region, user segment)",
      "Deploy the new version behind feature flags with canary traffic routing enabled",
      "Instrument automated analysis of key SLIs (latency, error rate, saturation) during rollout",
      "Progressively widen the blast radius in stages, gating each stage on metric thresholds",
      "Automatically promote to 100% or trigger rollback based on observability verdicts"
    ],
    "steps_zh": [
      "使用群组选择器（百分比、地区、用户分群）定义发布受众",
      "在启用金丝雀流量路由的同时，将新版本部署在特性开关后面",
      "在发布过程中对关键 SLI（延迟、错误率、饱和度）进行自动化分析",
      "分阶段逐步扩大影响半径，每个阶段以指标阈值为门控",
      "基于可观测性判定自动提升到 100% 或触发回滚"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Feature Flag",
      "Canary",
      "A/B Test",
      "Full Release"
    ],
    "viz_labels_zh": [
      "功能标志",
      "金丝雀",
      "A/B测试",
      "全量发布"
    ],
    "related": [
      "canary-deployment",
      "feature-flags",
      "slo-as-practice",
      "opentelemetry"
    ],
    "tags": [
      "progressive-delivery",
      "canary",
      "feature-flags",
      "observability",
      "rollout"
    ],
    "origin_author": "James Governor (RedMonk), 2018",
    "origin_source": "Progressive Delivery (redmonk.com, 2018); popularized by Weaveworks & Flagger",
    "origin_source_zh": "渐进式交付（redmonk.com，2018）；由 Weaveworks 和 Flagger 推广",
    "complexity": "advanced",
    "when_to_use": [
      "High-traffic services where a bad deploy can affect millions of users within minutes",
      "Teams practicing continuous deployment that need automated safety gates between stages",
      "Organizations combining multiple deployment strategies (canary + feature flags) under a unified workflow",
      "Environments where business metrics (conversion, revenue) must be validated alongside technical SLIs"
    ],
    "when_to_use_zh": [
      "高流量服务中，一次错误部署可能在几分钟内影响数百万用户",
      "实践持续部署的团队需要在各阶段之间设置自动安全门控",
      "将多种部署策略（金丝雀 + 特性开关）统一在单一工作流下的组织",
      "需要在验证技术 SLI 的同时验证业务指标（转化率、收入）的环境"
    ],
    "core_concepts": [
      "Gated Stages: Each rollout phase requires explicit pass/fail criteria before progressing to the next audience slice",
      "Automated Analysis: Observability platforms compare canary metrics against baseline in real time to eliminate human judgment delays",
      "Blast Radius Control: The combination of feature flags and traffic splitting limits the maximum user impact at any given stage",
      "Unified Rollout Model: Canary deployment, feature flagging, and A/B testing are composed into a single progressive pipeline"
    ],
    "core_concepts_zh": [
      "门控阶段：每个发布阶段在进入下一批受众前需满足明确的通过/失败标准",
      "自动化分析：可观测性平台实时将金丝雀指标与基线进行比较，消除人工判断延迟",
      "爆炸半径控制：特性开关与流量分割的组合限制了任一阶段的最大用户影响",
      "统一发布模型：金丝雀发布、特性开关和 A/B 测试组合为单一渐进式流水线"
    ],
    "timeline": [
      [
        "2018",
        "James Governor coins 'Progressive Delivery' at RedMonk, synthesizing canary + flags + observability"
      ],
      [
        "2019",
        "Weaveworks releases Flagger, the first Kubernetes-native progressive delivery controller"
      ],
      [
        "2020",
        "Argo Rollouts adds progressive delivery with automated analysis integration"
      ],
      [
        "2022",
        "Major cloud providers (AWS App Mesh, GCP Traffic Director) embed progressive delivery in managed services"
      ],
      [
        "2024",
        "AI-driven rollout analysis (anomaly detection on SLIs) becomes standard in enterprise progressive delivery platforms"
      ]
    ],
    "timeline_zh": [
      [
        "2018",
        "James Governor 在 RedMonk 提出「渐进式交付」，综合金丝雀 + 特性开关 + 可观测性"
      ],
      [
        "2019",
        "Weaveworks 发布 Flagger，首个 Kubernetes 原生渐进式交付控制器"
      ],
      [
        "2020",
        "Argo Rollouts 添加带自动化分析集成的渐进式交付功能"
      ],
      [
        "2022",
        "主要云厂商（AWS App Mesh、GCP Traffic Director）在托管服务中内置渐进式交付"
      ],
      [
        "2024",
        "AI 驱动的发布分析（SLI 异常检测）成为企业级渐进式交付平台的标配"
      ]
    ],
    "dos": [
      "Define clear metric thresholds and minimum observation windows before starting any progressive rollout",
      "Use automated canary analysis tools (Kayenta, Flagger) to remove subjective human judgment from promotion decisions",
      "Combine technical metrics (error rate, latency) with business metrics (conversion, revenue) for holistic rollout health",
      "Run progressive delivery in staging first to calibrate analysis sensitivity and avoid false positives in production"
    ],
    "dos_zh": [
      "在启动渐进式发布前明确定义指标阈值和最短观察窗口",
      "使用自动化金丝雀分析工具（Kayenta、Flagger）消除晋升决策中的主观人为判断",
      "将技术指标（错误率、延迟）与业务指标（转化率、收入）结合，全面评估发布健康状况",
      "先在预发布环境中运行渐进式交付以校准分析灵敏度，避免在生产环境中出现假阳性"
    ],
    "donts": [
      "Don't skip the baseline measurement phase — without a reliable baseline, canary comparison is meaningless",
      "Don't set observation windows too short — transient traffic patterns can mask regressions",
      "Don't ignore the long tail of latency (p99, p999) — median-only analysis misses the most painful user experiences",
      "Don't treat progressive delivery as a substitute for pre-production testing — it is a safety net, not a test suite"
    ],
    "donts_zh": [
      "不要跳过基线测量阶段——没有可靠的基线，金丝雀比较毫无意义",
      "不要将观察窗口设置得太短——瞬时流量模式可能掩盖回归问题",
      "不要忽略延迟长尾（p99、p999）——仅分析中位数会遗漏最痛苦的用户体验",
      "不要将渐进式交付视为预生产测试的替代品——它是安全网，不是测试套件"
    ],
    "case_study_company": "Intuit",
    "case_study": "Intuit adopted progressive delivery for TurboTax during the 2022 tax season, combining Argo Rollouts with custom Kayenta canary analysis. Each release progressed through four stages (1%, 10%, 50%, 100%) with automated SLI checks at each gate. This approach caught a latency regression at the 10% stage that would have impacted millions of users during peak filing, enabling automatic rollback within 3 minutes of detection.",
    "case_study_zh": "Intuit 在 2022 年报税季为 TurboTax 采用渐进式交付，将 Argo Rollouts 与自定义 Kayenta 金丝雀分析相结合。每次发布经过四个阶段（1%、10%、50%、100%），每个门控处进行自动化 SLI 检查。该方法在 10% 阶段捕获了一个延迟回归问题，否则将在高峰报税期影响数百万用户，检测到后 3 分钟内即完成自动回滚。",
    "when_not_to_use": [
      "Small internal tools with a handful of users where the overhead of multi-stage rollouts is unjustified",
      "Batch processing systems that do not serve real-time traffic and cannot be canary-tested",
      "Environments with insufficient observability infrastructure to support automated canary analysis",
      "Deployments requiring atomic all-or-nothing cutover (use blue-green instead)"
    ],
    "when_not_to_use_zh": [
      "用户极少的小型内部工具——多阶段发布的开销不合理",
      "不提供实时流量且无法进行金丝雀测试的批处理系统",
      "可观测性基础设施不足以支持自动化金丝雀分析的环境",
      "需要原子性全有或全无切换的部署（应使用蓝绿部署替代）"
    ],
    "adopters": [
      "Intuit",
      "Weaveworks",
      "Netflix",
      "Spotify",
      "DoorDash"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "James Governor (2018). \"Progressive Delivery\". redmonk.com.",
    "secondary_sources": [
      "Jez Humble and David Farley (2010). \"Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation\". Addison-Wesley.",
      "Weaveworks (2019). \"Progressive Delivery with Flagger\". flagger.app."
    ],
    "typed_relations": [
      {
        "slug": "canary-deployment",
        "type": "extends"
      },
      {
        "slug": "feature-flags",
        "type": "complement"
      },
      {
        "slug": "slo-as-practice",
        "type": "complement"
      },
      {
        "slug": "opentelemetry",
        "type": "complement"
      }
    ]
  },
  {
    "id": 184,
    "name": "Immutable Infrastructure",
    "name_zh": "不可变基础设施",
    "slug": "immutable-infrastructure",
    "category": "deployment",
    "desc": "Never patch; replace with new images",
    "desc_zh": "永不修补，用全新镜像替换",
    "steps": [
      "Build a machine or container image from a declarative specification (Dockerfile, Packer template, AMI builder)",
      "Version-tag and store the image in an artifact registry with full provenance metadata",
      "Deploy by replacing running instances with new image versions, never modifying live instances",
      "Validate the new deployment through health checks and smoke tests before draining old instances",
      "Decommission old images after a retention period; audit the registry for drift or orphaned artifacts"
    ],
    "steps_zh": [
      "从声明式规范（Dockerfile、Packer 模板、AMI 构建器）构建机器或容器镜像",
      "对镜像进行版本标记并存储在制品仓库中，附带完整的来源元数据",
      "通过用新镜像版本替换运行中的实例进行部署，永不修改在线实例",
      "在排空旧实例前通过健康检查和冒烟测试验证新部署",
      "保留期后退役旧镜像；审计仓库以发现漂移或孤立制品"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Image Build",
      "Provision",
      "Replace",
      "Retire"
    ],
    "viz_labels_zh": [
      "镜像构建",
      "预置",
      "替换",
      "下线"
    ],
    "related": [
      "infrastructure-as-code",
      "gitops",
      "blue-green-deployment"
    ],
    "tags": [
      "immutable",
      "infrastructure",
      "images",
      "no-patch",
      "deployment"
    ],
    "origin_author": "Chad Fowler, 2013 (blog post: 'Trash Your Servers and Burn Your Code')",
    "origin_source": "Trash Your Servers and Burn Your Code (Chad Fowler, 2013); popularized by Docker and HashiCorp",
    "origin_source_zh": "《扔掉你的服务器，烧掉你的代码》（Chad Fowler，2013）；由 Docker 和 HashiCorp 推广",
    "complexity": "intermediate",
    "when_to_use": [
      "Environments requiring strict auditability and reproducibility (finance, healthcare, regulated industries)",
      "Cloud-native architectures where instances are ephemeral and auto-scaled",
      "Teams experiencing configuration drift across long-lived servers that causes production incidents",
      "Security-sensitive systems where patching in place leaves uncertain intermediate states"
    ],
    "when_to_use_zh": [
      "要求严格可审计性和可重现性的环境（金融、医疗、受监管行业）",
      "实例短暂且自动扩缩容的云原生架构",
      "长期运行服务器上配置漂移导致生产事故的团队",
      "就地修补会留下不确定中间状态的安全敏感系统"
    ],
    "core_concepts": [
      "No In-Place Mutation: Running instances are never modified — all changes require building and deploying a new image",
      "Image as Artifact: The deployable unit is a versioned, tested image stored in a registry, not a set of scripts run on a live server",
      "Reproducibility: Any deployment can be exactly recreated from the image tag, eliminating 'works on my machine' and snowflake servers",
      "Disposability: Instances are cattle, not pets — they can be destroyed and recreated at any time without manual intervention"
    ],
    "core_concepts_zh": [
      "禁止就地变更：运行中的实例永不修改——所有变更都需要构建和部署新镜像",
      "镜像即制品：可部署单元是存储在仓库中的版本化、已测试的镜像，而非在活服务器上运行的脚本集",
      "可重现性：任何部署都可以从镜像标签精确重建，消除「在我的机器上能跑」和雪花服务器问题",
      "可丢弃性：实例是牲畜而非宠物——可以随时销毁和重建，无需人工干预"
    ],
    "timeline": [
      [
        "2013",
        "Chad Fowler publishes 'Trash Your Servers and Burn Your Code', popularizing the immutable infrastructure concept"
      ],
      [
        "2013",
        "Docker launches, making immutable container images practical and ubiquitous"
      ],
      [
        "2014",
        "HashiCorp releases Packer for building machine images across providers"
      ],
      [
        "2017",
        "Kubernetes adoption normalizes immutable containers as the default deployment unit"
      ],
      [
        "2022",
        "Supply chain security (SLSA, Sigstore) adds provenance and signing to immutable image pipelines"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "Chad Fowler 发表《扔掉你的服务器，烧掉你的代码》，推广不可变基础设施概念"
      ],
      [
        "2013",
        "Docker 发布，使不可变容器镜像变得实用且无处不在"
      ],
      [
        "2014",
        "HashiCorp 发布 Packer，支持跨云构建机器镜像"
      ],
      [
        "2017",
        "Kubernetes 的普及使不可变容器成为默认部署单元"
      ],
      [
        "2022",
        "供应链安全（SLSA、Sigstore）为不可变镜像流水线增加溯源和签名"
      ]
    ],
    "dos": [
      "Invest in fast image build pipelines — slow builds are the #1 adoption blocker for immutable infrastructure",
      "Tag every image with the commit SHA and build timestamp for complete traceability",
      "Use multi-stage builds to minimize image size and reduce the attack surface",
      "Externalize all configuration and secrets from images using environment variables or secret managers"
    ],
    "dos_zh": [
      "投资快速镜像构建流水线——构建慢是不可变基础设施采用的头号障碍",
      "为每个镜像标记提交 SHA 和构建时间戳以实现完全可追溯",
      "使用多阶段构建最小化镜像大小并减少攻击面",
      "使用环境变量或密钥管理器将所有配置和密钥从镜像中外部化"
    ],
    "donts": [
      "Don't SSH into running containers or instances to apply hotfixes — this violates the core immutability principle",
      "Don't bake secrets or credentials into images — they become visible in layer history and registries",
      "Don't skip vulnerability scanning of base images — an immutable image is only as secure as its layers",
      "Don't keep unlimited old image versions — storage costs grow quickly without a retention policy"
    ],
    "donts_zh": [
      "不要 SSH 进入运行中的容器或实例来应用热修复——这违反了核心不可变原则",
      "不要在镜像中写入密钥或凭证——它们会在层历史和仓库中可见",
      "不要跳过基础镜像的漏洞扫描——不可变镜像的安全性取决于其所有层",
      "不要保留无限多的旧镜像版本——没有保留策略存储成本会快速增长"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix pioneered immutable infrastructure at scale with their Bakery system, which bakes AMIs (Amazon Machine Images) for every deployment. By 2015, Netflix was deploying thousands of immutable AMIs per day across AWS, eliminating configuration drift that had previously caused unpredictable production failures. The Bakery pipeline — combined with Spinnaker for orchestration — became the gold standard for immutable deployment at cloud scale.",
    "case_study_zh": "Netflix 通过其 Bakery 系统率先大规模实践不可变基础设施，为每次部署烘焙 AMI（Amazon 机器镜像）。到 2015 年，Netflix 每天在 AWS 上部署数千个不可变 AMI，消除了此前导致不可预测生产故障的配置漂移。Bakery 流水线与 Spinnaker 编排相结合，成为云规模不可变部署的黄金标准。",
    "when_not_to_use": [
      "Legacy environments where in-place patching is the only viable option due to licensing or hardware constraints",
      "Development or debugging workflows where developers need to iterate rapidly on running instances",
      "Stateful systems (databases, file servers) where data persistence across replacements requires special handling",
      "Extremely resource-constrained edge devices where rebuilding and redeploying images is impractical"
    ],
    "when_not_to_use_zh": [
      "由于许可或硬件限制，就地修补是唯一可行选项的遗留环境",
      "开发者需要在运行实例上快速迭代的开发或调试工作流",
      "需要跨替换持久化数据的有状态系统（数据库、文件服务器）需要特殊处理",
      "重建和重新部署镜像不切实际的极端资源受限边缘设备"
    ],
    "adopters": [
      "Netflix",
      "Google",
      "HashiCorp",
      "Heroku",
      "AWS (Elastic Beanstalk)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "security"
    ],
    "maturity_ring": "established",
    "primary_source": "Chad Fowler (2013). \"Trash Your Servers and Burn Your Code: Immutable Infrastructure and Disposable Components\". chadfowler.com.",
    "secondary_sources": [
      "Kief Morris (2016). \"Infrastructure as Code: Managing Servers in the Cloud\". O'Reilly Media.",
      "Docker, Inc. (2013). \"Docker: An Open Platform for Distributed Applications\". docker.com."
    ],
    "typed_relations": [
      {
        "slug": "infrastructure-as-code",
        "type": "complement"
      },
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "blue-green-deployment",
        "type": "complement"
      }
    ]
  },
  {
    "id": 185,
    "name": "Chaos Engineering Practices",
    "name_zh": "混沌工程实践",
    "slug": "chaos-engineering-practices",
    "category": "deployment",
    "desc": "Operational practices for controlled failure injection: GameDays, blast radius control, and resilience validation",
    "desc_zh": "受控故障注入的运维实践：GameDay 演练、爆炸半径控制与韧性验证",
    "steps": [
      "Define steady-state hypotheses based on business-critical SLIs and expected system behavior",
      "Design experiments with explicit blast radius limits (scope, duration, rollback triggers)",
      "Run GameDay exercises where cross-functional teams inject failures in production or pre-production",
      "Observe system behavior under failure, comparing actual outcomes against steady-state hypotheses",
      "Document findings, fix weaknesses, and schedule recurring chaos experiments to prevent regression"
    ],
    "steps_zh": [
      "基于业务关键 SLI 和预期系统行为定义稳态假设",
      "设计带有明确爆炸半径限制（范围、持续时间、回滚触发器）的实验",
      "运行 GameDay 演练，跨职能团队在生产或预生产环境中注入故障",
      "观察故障下的系统行为，将实际结果与稳态假设进行比较",
      "记录发现、修复弱点并安排定期混沌实验以防止回归"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Steady State",
      "Inject",
      "Observe",
      "Restore"
    ],
    "viz_labels_zh": [
      "稳态基准",
      "注入故障",
      "观测",
      "恢复"
    ],
    "related": [
      "chaos-engineering",
      "slo-as-practice",
      "on-call-engineering",
      "runbook-automation",
      "progressive-delivery"
    ],
    "tags": [
      "chaos-engineering",
      "gameday",
      "blast-radius",
      "resilience",
      "operational-practice"
    ],
    "origin_author": "Netflix (Casey Rosenthal, Nora Jones), 2011-2017",
    "origin_source": "Chaos Engineering: System Resiliency in Practice (Casey Rosenthal & Nora Jones, O'Reilly 2020)",
    "origin_source_zh": "《混沌工程：实践中的系统韧性》（Casey Rosenthal & Nora Jones，O'Reilly 2020）",
    "complexity": "advanced",
    "when_to_use": [
      "Production systems where unknown failure modes pose significant business risk",
      "Organizations building confidence in disaster recovery and failover mechanisms",
      "Teams that have mature monitoring and incident response but lack proactive resilience validation",
      "Pre-event hardening (before peak traffic seasons, major launches, or compliance audits)"
    ],
    "when_to_use_zh": [
      "未知故障模式对业务构成重大风险的生产系统",
      "正在建立灾难恢复和故障转移机制信心的组织",
      "拥有成熟监控和事件响应但缺少主动韧性验证的团队",
      "活动前加固（高峰流量季、重大发布或合规审计之前）"
    ],
    "core_concepts": [
      "Steady-State Hypothesis: A measurable definition of normal system behavior that experiments aim to disprove under failure conditions",
      "Blast Radius Control: Limiting the scope of chaos experiments through targeted injection, short durations, and automated kill switches",
      "GameDay: A structured, time-boxed event where teams deliberately inject failures and practice incident response in a controlled setting",
      "Resilience Regression: Without recurring chaos experiments, systems silently lose resilience as code and infrastructure change over time"
    ],
    "core_concepts_zh": [
      "稳态假设：对正常系统行为的可量化定义，实验旨在故障条件下证伪该假设",
      "爆炸半径控制：通过针对性注入、短持续时间和自动终止开关限制混沌实验的范围",
      "GameDay：结构化的限时演练，团队在受控环境中故意注入故障并练习事件响应",
      "韧性退化：缺少定期混沌实验时，随着代码和基础设施的变更，系统会悄然失去韧性"
    ],
    "timeline": [
      [
        "2011",
        "Netflix launches Chaos Monkey to randomly terminate production instances in AWS"
      ],
      [
        "2014",
        "Netflix introduces Chaos Kong for simulating entire AWS region failures"
      ],
      [
        "2017",
        "Gremlin founded as the first commercial chaos engineering platform"
      ],
      [
        "2020",
        "Casey Rosenthal and Nora Jones publish Chaos Engineering (O'Reilly), formalizing GameDay and blast radius practices"
      ],
      [
        "2023",
        "Chaos engineering extends to AI/ML systems, testing model fallback and graceful degradation under inference failures"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Netflix 推出 Chaos Monkey，随机终止 AWS 中的生产实例"
      ],
      [
        "2014",
        "Netflix 引入 Chaos Kong，模拟整个 AWS 区域故障"
      ],
      [
        "2017",
        "Gremlin 成立，成为首个商业混沌工程平台"
      ],
      [
        "2020",
        "Casey Rosenthal 和 Nora Jones 出版《混沌工程》（O'Reilly），正式化 GameDay 和爆炸半径实践"
      ],
      [
        "2023",
        "混沌工程扩展至 AI/ML 系统，测试推理故障下的模型回退和优雅降级"
      ]
    ],
    "dos": [
      "Start with the smallest possible blast radius and expand gradually as team confidence grows",
      "Always have a clear rollback plan and kill switch before starting any chaos experiment",
      "Involve on-call engineers and SREs in GameDay planning to build real incident response muscle memory",
      "Document every experiment's hypothesis, execution, findings, and follow-up actions in a shared runbook"
    ],
    "dos_zh": [
      "从最小可能的爆炸半径开始，随着团队信心增长逐步扩大",
      "在启动任何混沌实验前始终准备好明确的回滚计划和终止开关",
      "让值班工程师和 SRE 参与 GameDay 规划，建立真实的事件响应肌肉记忆",
      "在共享运行手册中记录每个实验的假设、执行、发现和后续行动"
    ],
    "donts": [
      "Don't run chaos experiments without stakeholder buy-in — surprise failures erode organizational trust",
      "Don't inject failures during known peak traffic or business-critical windows without explicit approval",
      "Don't treat chaos engineering as a one-time exercise — resilience degrades continuously and must be tested regularly",
      "Don't skip the hypothesis step — running random failures without a measurable hypothesis yields noise, not insight"
    ],
    "donts_zh": [
      "不要在没有利益相关者认可的情况下运行混沌实验——意外故障会侵蚀组织信任",
      "不要在已知高峰流量或业务关键窗口期注入故障而未获得明确批准",
      "不要将混沌工程视为一次性活动——韧性会持续退化，必须定期测试",
      "不要跳过假设步骤——没有可量化假设的随机故障注入产生的是噪音而非洞察"
    ],
    "case_study_company": "Amazon",
    "case_study": "Amazon runs regular GameDay exercises across its retail and AWS infrastructure, simulating region-level failures and dependency outages. During a 2019 GameDay, teams discovered that a critical payment service had an undocumented dependency on a caching layer that would cause cascading failures during an AZ outage. The finding was fixed before Prime Day, preventing what could have been a multi-million dollar incident during peak shopping hours.",
    "case_study_zh": "亚马逊在其零售和 AWS 基础设施中定期运行 GameDay 演练，模拟区域级故障和依赖中断。在 2019 年的一次 GameDay 中，团队发现一个关键支付服务对缓存层有未记录的依赖，在可用区故障期间会导致级联失败。该问题在 Prime Day 之前被修复，避免了高峰购物时段可能造成的数百万美元级事故。",
    "when_not_to_use": [
      "Systems with no monitoring or observability — you cannot learn from chaos you cannot observe",
      "Early-stage startups where basic reliability engineering (CI/CD, testing, monitoring) is not yet in place",
      "Environments with no rollback capability where injected failures could cause unrecoverable damage",
      "Highly regulated systems where unplanned outages, even controlled ones, violate compliance requirements"
    ],
    "when_not_to_use_zh": [
      "没有监控或可观测性的系统——无法从观测不到的混沌中学习",
      "基础可靠性工程（CI/CD、测试、监控）尚未建立的早期创业公司",
      "没有回滚能力、注入故障可能造成不可恢复损坏的环境",
      "非计划中断（即使是受控的）违反合规要求的强监管系统"
    ],
    "adopters": [
      "Netflix",
      "Amazon",
      "Google",
      "Gremlin",
      "Shopify"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Casey Rosenthal and Nora Jones (2020). \"Chaos Engineering: System Resiliency in Practice\". O'Reilly Media.",
    "secondary_sources": [
      "Netflix Technology Blog (2011). \"The Netflix Simian Army\". netflixtechblog.com.",
      "Ali Basiri et al. (2016). \"Chaos Engineering\". IEEE Software, 33(3)."
    ],
    "typed_relations": [
      {
        "slug": "chaos-engineering",
        "type": "extends"
      },
      {
        "slug": "slo-as-practice",
        "type": "complement"
      },
      {
        "slug": "on-call-engineering",
        "type": "complement"
      },
      {
        "slug": "runbook-automation",
        "type": "complement"
      },
      {
        "slug": "progressive-delivery",
        "type": "complement"
      }
    ]
  },
  {
    "id": 186,
    "name": "Platform as a Product",
    "name_zh": "平台即产品",
    "slug": "platform-as-a-product",
    "category": "deployment",
    "desc": "Treat internal platforms like products with users and roadmaps",
    "desc_zh": "将内部平台视为拥有用户和路线图的产品来运营",
    "steps": [
      "Identify internal developers as the platform's primary customers and conduct user research",
      "Define a product vision and roadmap for the platform based on developer pain points and organizational goals",
      "Build self-service capabilities with golden paths, documentation, and developer portals",
      "Measure platform adoption, developer satisfaction (DevEx surveys), and time-to-production as key metrics",
      "Iterate on the platform based on feedback loops, treating feature requests like a product backlog"
    ],
    "steps_zh": [
      "将内部开发者视为平台的主要客户并进行用户研究",
      "基于开发者痛点和组织目标为平台制定产品愿景和路线图",
      "构建自助服务能力，包含最佳路径、文档和开发者门户",
      "将平台采用率、开发者满意度（DevEx 调查）和上线时间作为关键指标",
      "基于反馈循环迭代平台，将功能请求作为产品待办事项管理"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Discover",
      "Build",
      "Deliver",
      "Measure"
    ],
    "viz_labels_zh": [
      "需求发现",
      "平台构建",
      "交付使用",
      "效果度量"
    ],
    "related": [
      "team-topologies",
      "developer-experience-framework",
      "infrastructure-as-code",
      "gitops"
    ],
    "tags": [
      "platform-engineering",
      "product-thinking",
      "developer-experience",
      "internal-platform",
      "team-topologies"
    ],
    "origin_author": "Evan Bottcher (ThoughtWorks), 2018; expanded by Team Topologies (Skelton & Pais, 2019)",
    "origin_source": "Team Topologies (Matthew Skelton & Manuel Pais, 2019); What I Talk About When I Talk About Platforms (Evan Bottcher, martinfowler.com, 2018)",
    "origin_source_zh": "《团队拓扑》（Matthew Skelton & Manuel Pais，2019）；《当我谈论平台时我在谈什么》（Evan Bottcher，martinfowler.com，2018）",
    "complexity": "advanced",
    "when_to_use": [
      "Organizations with more than 5-10 product teams that share common infrastructure and tooling needs",
      "Environments where developer productivity is bottlenecked by inconsistent or manual infrastructure provisioning",
      "Companies transitioning from project-based IT to product-based engineering teams",
      "Enterprises where cognitive load on stream-aligned teams is unsustainably high due to infrastructure complexity"
    ],
    "when_to_use_zh": [
      "拥有 5-10 个以上共享通用基础设施和工具需求的产品团队的组织",
      "开发者生产力因基础设施供给不一致或手动操作而成为瓶颈的环境",
      "从项目制 IT 转型为产品制工程团队的公司",
      "由于基础设施复杂性导致流对齐团队认知负荷不可持续的企业"
    ],
    "core_concepts": [
      "Thinnest Viable Platform: Build the smallest platform that accelerates teams, avoiding the trap of over-engineering a platform nobody asked for",
      "Golden Paths: Opinionated, well-supported default workflows that handle 80% of use cases, with escape hatches for the remaining 20%",
      "Developer as Customer: Apply product management practices — user research, satisfaction metrics, roadmapping — to internal platform development",
      "Platform Team as Enabling Team: The platform team's success is measured by the productivity and autonomy of the teams it serves, not by its own output"
    ],
    "core_concepts_zh": [
      "最薄可行平台：构建加速团队的最小平台，避免过度工程化一个没人需要的平台",
      "最佳路径：有主见的、良好支持的默认工作流，处理 80% 的用例，为其余 20% 提供逃生通道",
      "开发者即客户：将产品管理实践——用户研究、满意度指标、路线图——应用于内部平台开发",
      "平台团队即赋能团队：平台团队的成功以其服务的团队的生产力和自主性来衡量，而非自身产出"
    ],
    "timeline": [
      [
        "2018",
        "Evan Bottcher publishes 'What I Talk About When I Talk About Platforms' on martinfowler.com"
      ],
      [
        "2019",
        "Skelton and Pais formalize platform teams in Team Topologies, defining the platform as a product model"
      ],
      [
        "2021",
        "Humanitec, Backstage (Spotify), and Port emerge as developer portal and platform orchestration tools"
      ],
      [
        "2023",
        "Gartner identifies platform engineering as a top strategic technology trend"
      ],
      [
        "2024",
        "CNCF publishes Platform Engineering Maturity Model; platform-as-a-product becomes industry standard practice"
      ]
    ],
    "timeline_zh": [
      [
        "2018",
        "Evan Bottcher 在 martinfowler.com 发表《当我谈论平台时我在谈什么》"
      ],
      [
        "2019",
        "Skelton 和 Pais 在《团队拓扑》中正式化平台团队，定义平台即产品模型"
      ],
      [
        "2021",
        "Humanitec、Backstage（Spotify）和 Port 涌现为开发者门户和平台编排工具"
      ],
      [
        "2023",
        "Gartner 将平台工程列为年度十大战略技术趋势"
      ],
      [
        "2024",
        "CNCF 发布平台工程成熟度模型；平台即产品成为行业标准实践"
      ]
    ],
    "dos": [
      "Conduct regular developer satisfaction surveys and use NPS/CSAT to track platform health",
      "Start with a thin platform that solves one or two critical pain points and expand based on demand",
      "Publish a public roadmap and changelog so consuming teams can plan around platform changes",
      "Staff the platform team with product management and UX skills, not just infrastructure engineers"
    ],
    "dos_zh": [
      "定期进行开发者满意度调查，使用 NPS/CSAT 跟踪平台健康状况",
      "从解决一两个关键痛点的薄平台开始，根据需求逐步扩展",
      "发布公开路线图和变更日志，让消费团队可以围绕平台变更进行规划",
      "在平台团队中配备产品管理和用户体验人才，而非仅有基础设施工程师"
    ],
    "donts": [
      "Don't mandate platform adoption — if the platform is good, teams will choose it; forced adoption breeds resentment",
      "Don't build the platform in isolation from its users — co-create with embedded stream-aligned team members",
      "Don't measure platform success by features shipped — measure it by developer productivity and satisfaction",
      "Don't let the platform become a bottleneck — if teams have to file tickets and wait, you've built a service desk, not a platform"
    ],
    "donts_zh": [
      "不要强制采用平台——如果平台足够好，团队会主动选择；强制采用会滋生抵触",
      "不要在与用户隔离的情况下构建平台——与嵌入的流对齐团队成员共同创建",
      "不要以交付的功能数量衡量平台成功——以开发者生产力和满意度来衡量",
      "不要让平台成为瓶颈——如果团队必须提工单等待，你构建的是服务台而非平台"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify built Backstage as an internal developer portal to treat their platform as a product. By 2020, Backstage unified over 100 internal tools behind a single interface with a plugin architecture. Developers could provision infrastructure, view service health, and access documentation from one place. Spotify open-sourced Backstage in 2020, and it was accepted into the CNCF in 2022, becoming the de facto standard for developer portals across the industry.",
    "case_study_zh": "Spotify 构建了 Backstage 作为内部开发者门户，将其平台视为产品来运营。到 2020 年，Backstage 通过插件架构将 100 多个内部工具统一在单一界面后。开发者可以在一个地方配置基础设施、查看服务健康状况和访问文档。Spotify 于 2020 年开源 Backstage，2022 年被 CNCF 接收，成为全行业开发者门户的事实标准。",
    "when_not_to_use": [
      "Small organizations with fewer than 5 teams where the overhead of a dedicated platform team is unjustified",
      "Early-stage startups where speed of product iteration matters more than developer experience standardization",
      "Organizations where all teams use identical, simple tech stacks that don't benefit from platform abstraction",
      "Environments where executive sponsorship for long-term platform investment is absent"
    ],
    "when_not_to_use_zh": [
      "团队少于 5 个的小型组织——专职平台团队的开销不合理",
      "产品迭代速度比开发者体验标准化更重要的早期创业公司",
      "所有团队使用相同且简单的技术栈、不需要平台抽象的组织",
      "缺乏对长期平台投资的高管支持的环境"
    ],
    "adopters": [
      "Spotify",
      "Zalando",
      "Mercedes-Benz",
      "DoorDash",
      "Humanitec"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Evan Bottcher (2018). \"What I Talk About When I Talk About Platforms\". martinfowler.com.",
    "secondary_sources": [
      "Matthew Skelton and Manuel Pais (2019). \"Team Topologies: Organizing Business and Technology Teams for Fast Flow\". IT Revolution Press.",
      "Gregor Hohpe (2020). \"The Software Architect Elevator\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "team-topologies",
        "type": "complement"
      },
      {
        "slug": "developer-experience-framework",
        "type": "complement"
      },
      {
        "slug": "infrastructure-as-code",
        "type": "complement"
      },
      {
        "slug": "gitops",
        "type": "complement"
      }
    ]
  },
  {
    "id": 290,
    "name": "Feature Environments",
    "name_zh": "功能环境",
    "slug": "feature-environments",
    "category": "deployment",
    "desc": "Ephemeral full-stack environments provisioned per pull request or branch",
    "desc_zh": "为每个拉取请求或分支自动创建临时全栈环境",
    "steps": [
      "Configure CI/CD to automatically provision a named environment on pull request open, using the branch name as a namespace or subdomain",
      "Deploy the full application stack (frontend, backend, database seed) into the isolated environment within the CI pipeline",
      "Post the environment URL as a PR comment so reviewers and QA can access it without local setup",
      "Run automated tests (smoke, E2E, visual regression) against the ephemeral environment as part of the PR checks",
      "Automatically tear down the environment on PR close or merge to reclaim infrastructure resources"
    ],
    "steps_zh": [
      "配置 CI/CD 在拉取请求打开时使用分支名作为命名空间或子域名自动创建命名环境",
      "在 CI 流水线中将完整应用栈（前端、后端、数据库种子）部署到隔离环境",
      "将环境 URL 作为 PR 评论发布，使审查者和 QA 无需本地设置即可访问",
      "在 PR 检查中对临时环境执行自动化测试（冒烟测试、E2E 测试、视觉回归测试）",
      "在 PR 关闭或合并时自动销毁环境以回收基础设施资源"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Branch",
      "Spin Up",
      "Test",
      "Tear Down"
    ],
    "viz_labels_zh": [
      "功能分支",
      "环境创建",
      "测试验证",
      "环境销毁"
    ],
    "related": [
      "gitops",
      "progressive-delivery",
      "blue-green-deployment",
      "infrastructure-as-code"
    ],
    "tags": [
      "ephemeral-environments",
      "preview-environments",
      "pull-request",
      "ci-cd",
      "deployment"
    ],
    "origin_author": "Vercel",
    "origin_source": "Vercel (2018). \"Preview Deployments\". vercel.com/docs.",
    "origin_source_zh": "Vercel（2018）。《预览部署》。vercel.com/docs。",
    "complexity": "intermediate",
    "when_to_use": [
      "Frontend and full-stack teams that want product managers and designers to review changes without running code locally",
      "Projects with complex integration dependencies where local development environment setup is costly and error-prone",
      "Teams practicing trunk-based development where frequent PRs need isolated validation before merge",
      "Organizations that want to shift QA and stakeholder review earlier in the development cycle"
    ],
    "when_to_use_zh": [
      "希望产品经理和设计师无需本地运行代码即可审查变更的前端和全栈团队",
      "本地开发环境搭建成本高且容易出错的复杂集成依赖项目",
      "实践主干开发、频繁 PR 需要在合并前进行隔离验证的团队",
      "希望将 QA 和利益相关者评审提前到开发周期更早阶段的组织"
    ],
    "core_concepts": [
      "Ephemeral Namespace Isolation: Each PR gets a dedicated namespace, subdomain, or cluster namespace that is fully isolated from staging and production",
      "Infrastructure-on-Demand: Environments are created and destroyed programmatically by CI events, treating compute as a disposable resource",
      "Shift-Left Review: Product stakeholders, designers, and QA can validate running software early, catching requirement gaps before merging",
      "Hermetic Test Execution: E2E and integration tests run against a realistic stack that mirrors production, eliminating environment parity bugs"
    ],
    "core_concepts_zh": [
      "临时命名空间隔离：每个 PR 获得一个专用命名空间、子域名或集群命名空间，与暂存和生产环境完全隔离",
      "按需基础设施：环境由 CI 事件以编程方式创建和销毁，将计算视为可丢弃资源",
      "左移评审：产品利益相关者、设计师和 QA 可以在早期验证运行中的软件，在合并前发现需求差距",
      "密封测试执行：E2E 和集成测试在镜像生产环境的真实栈上运行，消除环境一致性错误"
    ],
    "timeline": [
      [
        "2016",
        "Heroku introduces Review Apps — per-PR ephemeral environments — as a first-class platform feature"
      ],
      [
        "2018",
        "Vercel launches Preview Deployments, making per-PR frontend environments the default for Jamstack projects"
      ],
      [
        "2020",
        "Netlify, Render, and Railway popularize the pattern across the full-stack ecosystem"
      ],
      [
        "2023",
        "GitHub Environments and tools like Uffizzi, Bunnyshell, and Qovery extend ephemeral environments to full Kubernetes stacks"
      ]
    ],
    "timeline_zh": [
      [
        "2016",
        "Heroku 推出 Review Apps——每个 PR 的临时环境——作为平台一等功能"
      ],
      [
        "2018",
        "Vercel 推出预览部署，使每个 PR 的前端环境成为 Jamstack 项目的默认选项"
      ],
      [
        "2020",
        "Netlify、Render 和 Railway 在全栈生态系统中普及了这一模式"
      ],
      [
        "2023",
        "GitHub Environments 和 Uffizzi、Bunnyshell、Qovery 等工具将临时环境扩展到完整的 Kubernetes 栈"
      ]
    ],
    "dos": [
      "Do seed ephemeral databases with anonymized production snapshots or realistic fixtures so that reviewers test against representative data",
      "Do set TTLs and maximum lifetime limits on environments to prevent zombie environments that consume resources indefinitely",
      "Do cache container image layers and dependency installs aggressively to keep provisioning time under 2 minutes",
      "Do post the environment URL and a teardown timestamp directly in the PR comment so reviewers know both where to test and how long they have"
    ],
    "dos_zh": [
      "务必使用匿名化生产快照或真实数据填充临时数据库，使审查者在具有代表性的数据上测试",
      "务必为环境设置 TTL 和最大生命周期限制，防止僵尸环境无限期消耗资源",
      "务必积极缓存容器镜像层和依赖安装，使配置时间保持在2分钟以内",
      "务必在 PR 评论中直接发布环境 URL 和销毁时间戳，使审查者知道在哪里测试以及还有多长时间"
    ],
    "donts": [
      "Don't give ephemeral environments access to production databases or secrets — use isolated seed data to prevent accidental data mutation",
      "Don't run ephemeral environments against shared staging backends, because concurrent PRs will interfere with each other's test state",
      "Don't ignore environment provisioning failures in CI — a silent environment creation error means reviewers test against stale or nonexistent deployments",
      "Don't skip environment teardown automation — manual cleanup creates a long tail of forgotten environments that silently inflate cloud bills"
    ],
    "donts_zh": [
      "不要让临时环境访问生产数据库或机密——使用隔离的种子数据防止意外数据变更",
      "不要在共享暂存后端上运行临时环境，因为并发 PR 会相互干扰测试状态",
      "不要忽略 CI 中的环境创建失败——静默的环境创建错误意味着审查者在过时或不存在的部署上测试",
      "不要跳过环境销毁自动化——手动清理会产生被遗忘的环境长尾，悄悄推高云账单"
    ],
    "case_study_company": "Vercel",
    "case_study": "Vercel made feature environments a core product differentiator by giving every Git push to a non-production branch its own unique preview URL. When a PR is opened on a Next.js project hosted on Vercel, a full deployment is triggered within 30–60 seconds, the URL is posted as a GitHub status check, and the deployment is automatically cleaned up on PR merge. This workflow became so popular that it effectively set the industry standard for frontend preview environments, with Netlify, Cloudflare Pages, and AWS Amplify all shipping equivalent capabilities within two years.",
    "case_study_zh": "Vercel 通过为每次推送到非生产分支的 Git 提交提供独特预览 URL，将功能环境打造为核心产品差异化优势。当在 Vercel 托管的 Next.js 项目上打开 PR 时，30–60 秒内触发完整部署，URL 作为 GitHub 状态检查发布，部署在 PR 合并时自动清理。这一工作流大受欢迎，有效地设定了前端预览环境的行业标准，Netlify、Cloudflare Pages 和 AWS Amplify 均在两年内推出了同等能力。",
    "when_not_to_use": [
      "Applications with extremely expensive infrastructure stacks where provisioning a full environment per PR is cost-prohibitive",
      "Monoliths with long build times where environment provisioning exceeds 15 minutes, creating more friction than value",
      "Teams with very low PR volume where the overhead of building and maintaining the ephemeral environment pipeline exceeds the review benefit",
      "Regulated environments where ephemeral environments cannot meet compliance requirements for data residency or access control"
    ],
    "when_not_to_use_zh": [
      "基础设施栈极其昂贵、为每个 PR 创建完整环境成本过高的应用",
      "构建时间过长、环境配置超过15分钟的单体应用——产生的摩擦多于价值",
      "PR 量极低的团队——构建和维护临时环境流水线的开销超过评审收益",
      "临时环境无法满足数据驻留或访问控制合规要求的受监管环境"
    ],
    "adopters": [
      "Vercel",
      "Netlify",
      "Heroku",
      "Shopify",
      "GitHub",
      "Stripe"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Vercel (2018). \"Preview Deployments\". vercel.com/docs.",
    "secondary_sources": [
      "Heroku (2016). \"Review Apps Documentation\". devcenter.heroku.com.",
      "Nader Dabit (2021). \"Preview Environments for Every Pull Request\". AWS Amplify blog.",
      "Adam Zimman (2022). \"What Are Feature Environments?\". LaunchDarkly blog."
    ],
    "typed_relations": [
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "progressive-delivery",
        "type": "complement"
      },
      {
        "slug": "infrastructure-as-code",
        "type": "complement"
      },
      {
        "slug": "blue-green-deployment",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 291,
    "name": "Deployment Stamps Pattern",
    "name_zh": "部署印章模式",
    "slug": "deployment-stamps-pattern",
    "category": "deployment",
    "desc": "Scale by deploying multiple isolated copies of the application stack per tenant or region",
    "desc_zh": "通过为每个租户或地区部署多个隔离的应用栈副本来扩展规模",
    "steps": [
      "Define the stamp unit — the full set of resources (compute, database, cache, networking) that constitutes one isolated deployment",
      "Templatize the stamp using infrastructure-as-code so that each stamp is provisioned identically from the same template",
      "Build a routing layer (geo-DNS, tenant lookup service) that maps each customer or region to their assigned stamp",
      "Deploy new stamps for new tenants or regions; scale within a stamp until it reaches its defined capacity ceiling",
      "Operate stamps independently — updates, incidents, and capacity changes on one stamp do not affect others"
    ],
    "steps_zh": [
      "定义印章单元——构成一个隔离部署的完整资源集合（计算、数据库、缓存、网络）",
      "使用基础设施即代码对印章进行模板化，使每个印章从相同模板以相同方式创建",
      "构建路由层（地理 DNS、租户查找服务）将每个客户或地区映射到其分配的印章",
      "为新租户或地区部署新印章；在印章内扩展直到达到其定义的容量上限",
      "独立运营印章——一个印章上的更新、事故和容量变更不影响其他印章"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Region A",
      "Region B",
      "Stamp Config",
      "Routing"
    ],
    "viz_labels_zh": [
      "区域A",
      "区域B",
      "印模配置",
      "路由"
    ],
    "related": [
      "infrastructure-as-code",
      "gitops",
      "blue-green-deployment",
      "platform-engineering"
    ],
    "tags": [
      "stamps",
      "multi-tenant",
      "scaling",
      "isolation",
      "cloud-architecture"
    ],
    "origin_author": "Microsoft Azure",
    "origin_source": "Microsoft Azure Architecture Center (2019). \"Deployment Stamps pattern\". learn.microsoft.com.",
    "origin_source_zh": "微软 Azure 架构中心（2019）。《部署印章模式》。learn.microsoft.com。",
    "complexity": "advanced",
    "when_to_use": [
      "SaaS applications requiring strong data isolation between enterprise customers for compliance or contractual reasons",
      "Global services where latency requirements mandate co-locating compute and data near each customer region",
      "Systems that must scale beyond the limits of a single database or infrastructure stack without re-architecting the application",
      "Applications with highly variable per-tenant load profiles where noisy-neighbor effects on a shared stack would degrade SLAs"
    ],
    "when_to_use_zh": [
      "出于合规或合同原因需要在企业客户之间强隔离数据的 SaaS 应用",
      "延迟要求强制在每个客户地区附近部署计算和数据的全球服务",
      "必须在不重新架构应用的情况下扩展超过单一数据库或基础设施栈限制的系统",
      "每租户负载特征高度可变、共享栈上的吵闹邻居效应会降低 SLA 的应用"
    ],
    "core_concepts": [
      "Stamp Unit: A stamp is the smallest independently deployable unit of the full application stack; it is self-contained and shares no mutable state with other stamps",
      "Template Consistency: Every stamp is provisioned from the same IaC template, ensuring configuration parity and eliminating stamp-specific drift",
      "Tenant-to-Stamp Routing: A routing layer maps each tenant or region identifier to its assigned stamp; this layer is the only shared component",
      "Blast Radius Containment: Incidents, performance degradation, or failed deployments on one stamp are fully contained and do not propagate to other stamps"
    ],
    "core_concepts_zh": [
      "印章单元：印章是完整应用栈中最小的独立可部署单元；它是自包含的，与其他印章不共享可变状态",
      "模板一致性：每个印章从相同的 IaC 模板创建，确保配置一致性并消除印章特有漂移",
      "租户到印章路由：路由层将每个租户或地区标识符映射到其分配的印章；该层是唯一共享的组件",
      "爆炸半径控制：一个印章上的事故、性能降级或失败部署完全被隔离，不会传播到其他印章"
    ],
    "timeline": [
      [
        "2010",
        "Large SaaS providers (Salesforce, Workday) use isolated tenant stacks informally to meet enterprise data isolation requirements"
      ],
      [
        "2019",
        "Microsoft Azure Architecture Center formalizes and publishes the Deployment Stamps pattern with reference implementations"
      ],
      [
        "2021",
        "Kubernetes multi-tenancy tooling (vcluster, Capsule) enables lightweight stamp-per-tenant isolation within shared clusters"
      ],
      [
        "2023",
        "Cell-based architecture (Amazon, Slack) emerges as a variant of the stamps pattern optimized for fault isolation at scale"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "大型 SaaS 提供商（Salesforce、Workday）非正式地使用隔离租户栈满足企业数据隔离要求"
      ],
      [
        "2019",
        "微软 Azure 架构中心正式化并发布部署印章模式及参考实现"
      ],
      [
        "2021",
        "Kubernetes 多租户工具（vcluster、Capsule）在共享集群内实现轻量级每租户印章隔离"
      ],
      [
        "2023",
        "基于单元的架构（Amazon、Slack）作为印章模式的变体出现，针对规模故障隔离进行了优化"
      ]
    ],
    "dos": [
      "Do automate stamp provisioning end-to-end from IaC templates so that a new stamp can be deployed in minutes without manual steps",
      "Do implement a centralized stamp registry that tracks which tenant maps to which stamp, their capacity utilization, and health status",
      "Do version stamp templates independently from application code so infrastructure changes can be applied across stamps in controlled waves",
      "Do design stamps to be horizontally scalable within their bounds so you can right-size before provisioning a new stamp"
    ],
    "dos_zh": [
      "务必从 IaC 模板端到端自动化印章配置，使新印章可以在数分钟内无需手动步骤地部署",
      "务必实现一个集中式印章注册表，跟踪哪个租户映射到哪个印章、其容量利用率和健康状态",
      "务必独立于应用代码对印章模板进行版本控制，使基础设施变更可以以受控方式应用于各印章",
      "务必将印章设计为在其边界内水平可扩展，以便在创建新印章前进行适当调整"
    ],
    "donts": [
      "Don't allow stamps to share mutable state (databases, caches, message queues) — shared state defeats the isolation guarantee and reintroduces noisy-neighbor risk",
      "Don't manually configure individual stamps — every deviation from the template becomes undocumented drift that causes future incidents",
      "Don't over-provision stamps from day one — use capacity modeling to right-size the stamp unit and grow the number of stamps rather than the size",
      "Don't use the stamps pattern as a substitute for good application architecture — it scales isolation, not poorly-written code"
    ],
    "donts_zh": [
      "不要让印章共享可变状态（数据库、缓存、消息队列）——共享状态破坏了隔离保证并重新引入吵闹邻居风险",
      "不要手动配置单个印章——每个与模板的偏差都成为未记录的漂移，导致未来事故",
      "不要从第一天就过度配置印章——使用容量建模适当调整印章单元大小，通过增加印章数量而非印章大小来扩展",
      "不要将印章模式用作良好应用架构的替代品——它扩展的是隔离，而非编写糟糕的代码"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub uses a stamps-like deployment model for GitHub Enterprise Server, where each enterprise customer runs their own isolated stack — compute, storage, and networking — deployed from the same release artifact. This architecture ensures that a performance problem or infrastructure incident at one enterprise installation has zero impact on other customers. GitHub also uses this pattern internally for its own reliability zones, with independent stacks per region that can absorb a full zone failure without cross-zone blast radius.",
    "case_study_zh": "GitHub 为 GitHub Enterprise Server 使用类印章的部署模型，每个企业客户运行自己独立的栈——计算、存储和网络——从相同的发布制品部署。此架构确保一个企业安装中的性能问题或基础设施事故对其他客户零影响。GitHub 内部也将此模式用于其自身的可靠性区域，每个地区有独立的栈，可以在无跨区域爆炸半径的情况下吸收整个区域故障。",
    "when_not_to_use": [
      "Consumer SaaS with millions of end-user accounts where per-user stamp isolation is economically infeasible",
      "Applications with strong cross-tenant data sharing requirements where isolation prevents legitimate data access patterns",
      "Small teams without the platform engineering maturity to build, operate, and version the stamp provisioning pipeline",
      "Systems with uniform, predictable load where a single well-scaled stack is more cost-effective than many small stamps"
    ],
    "when_not_to_use_zh": [
      "拥有数百万终端用户账户的消费者 SaaS——每用户印章隔离在经济上不可行",
      "具有强跨租户数据共享需求、隔离阻止合法数据访问模式的应用",
      "缺乏构建、运营和版本化印章配置流水线的平台工程成熟度的小团队",
      "负载均匀可预测的系统——单一良好扩展的栈比许多小印章更具成本效益"
    ],
    "adopters": [
      "Microsoft Azure",
      "GitHub",
      "Salesforce",
      "SAP",
      "Workday",
      "Twilio"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "security"
    ],
    "maturity_ring": "established",
    "primary_source": "Microsoft Azure Architecture Center (2019). \"Deployment Stamps pattern\". learn.microsoft.com/azure/architecture/patterns/deployment-stamp.",
    "secondary_sources": [
      "Colm MacCarthaigh (2022). \"Avoiding fallback in distributed systems\". AWS builder's library. aws.amazon.com.",
      "Slack Engineering (2022). \"Cell-based architecture\". slack.engineering.",
      "Gregor Hohpe (2020). \"The Software Architect Elevator\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "infrastructure-as-code",
        "type": "complement"
      },
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "blue-green-deployment",
        "type": "complement"
      },
      {
        "slug": "platform-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 292,
    "name": "Infrastructure as Code Maturity Model",
    "name_zh": "基础设施即代码成熟度模型",
    "slug": "iac-maturity-model",
    "category": "deployment",
    "desc": "Staged progression from manual infrastructure to fully automated self-service IaC",
    "desc_zh": "从手动基础设施到完全自动化自服务 IaC 的阶段性演进",
    "steps": [
      "Assess current state: audit how infrastructure is provisioned today — manual console, scripts, partial IaC, or full automation",
      "Define target maturity level and identify the gaps between current and desired state for each infrastructure domain",
      "Introduce declarative IaC tooling (Terraform, Pulumi, CloudFormation) for net-new resources while coexisting with manual legacy",
      "Build CI/CD pipelines that plan, validate, and apply infrastructure changes automatically on merge",
      "Evolve toward self-service: platform teams publish reusable modules; product teams consume them without infrastructure knowledge"
    ],
    "steps_zh": [
      "评估当前状态：审计当前基础设施的配置方式——手动控制台、脚本、部分 IaC 或完全自动化",
      "定义目标成熟度级别，识别每个基础设施域当前状态与期望状态之间的差距",
      "对净新资源引入声明式 IaC 工具（Terraform、Pulumi、CloudFormation），同时与手动遗留系统共存",
      "构建在合并时自动计划、验证和应用基础设施变更的 CI/CD 流水线",
      "向自服务演进：平台团队发布可复用模块；产品团队无需基础设施知识即可使用"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Manual",
      "Scripted",
      "Automated",
      "Policy-Driven"
    ],
    "viz_labels_zh": [
      "手动操作",
      "脚本化",
      "自动化",
      "策略驱动"
    ],
    "related": [
      "infrastructure-as-code",
      "gitops",
      "platform-engineering",
      "deployment-stamps-pattern"
    ],
    "tags": [
      "iac",
      "maturity-model",
      "infrastructure",
      "automation",
      "self-service"
    ],
    "origin_author": "Kief Morris",
    "origin_source": "Kief Morris (2020). \"Infrastructure as Code: Dynamic Systems for the Cloud Age\" (2nd ed.). O'Reilly Media.",
    "origin_source_zh": "Kief Morris（2020）。《基础设施即代码：云时代的动态系统》（第2版）。O'Reilly Media。",
    "complexity": "intermediate",
    "when_to_use": [
      "Organizations beginning or deepening an IaC adoption journey who need a roadmap to guide investment and measure progress",
      "Platform engineering teams building a developer self-service layer who want to articulate the maturity levels to leadership",
      "Engineering managers conducting infrastructure capability audits before a cloud migration or re-platforming initiative",
      "Teams struggling with IaC drift, snowflake servers, or inconsistent environments who need a structured improvement path"
    ],
    "when_to_use_zh": [
      "开始或深化 IaC 采用旅程的组织，需要指导投资和衡量进展的路线图",
      "构建开发者自服务层的平台工程团队，希望向领导层阐明成熟度级别",
      "在云迁移或重新平台化举措前进行基础设施能力审计的工程经理",
      "在 IaC 漂移、雪花服务器或不一致环境方面挣扎、需要结构化改进路径的团队"
    ],
    "core_concepts": [
      "Maturity Levels: Typically five stages from Level 0 (fully manual) through Level 4 (dynamic self-service), each with defined characteristics and exit criteria",
      "Idempotency Principle: IaC at maturity means running the same script repeatedly produces the same outcome; drift is impossible by design",
      "Module Abstraction: Higher maturity levels abstract infrastructure complexity into reusable, versioned modules that encode organizational standards",
      "Policy as Code: Advanced maturity incorporates automated compliance validation (Open Policy Agent, Sentinel) into the IaC pipeline"
    ],
    "core_concepts_zh": [
      "成熟度级别：通常从0级（完全手动）到4级（动态自服务）分五个阶段，每个阶段都有明确的特征和退出标准",
      "幂等性原则：成熟的 IaC 意味着重复运行相同脚本产生相同结果；漂移在设计上不可能发生",
      "模块抽象：更高成熟度级别将基础设施复杂性抽象为可复用的、版本化的模块，这些模块编码了组织标准",
      "策略即代码：高级成熟度将自动化合规验证（Open Policy Agent、Sentinel）集成到 IaC 流水线中"
    ],
    "timeline": [
      [
        "2006",
        "CFEngine and early configuration management tools (Puppet, Chef) establish the principle of describing desired state as code"
      ],
      [
        "2014",
        "Terraform 0.1 released by HashiCorp; declarative cloud infrastructure provisioning becomes accessible to the broader engineering community"
      ],
      [
        "2016",
        "Kief Morris publishes the first edition of Infrastructure as Code, articulating maturity stages and engineering practices"
      ],
      [
        "2021",
        "Platform engineering movement frames IaC maturity as a prerequisite for developer self-service; CNCF publishes the Platform Engineering maturity model"
      ]
    ],
    "timeline_zh": [
      [
        "2006",
        "CFEngine 和早期配置管理工具（Puppet、Chef）确立了将期望状态描述为代码的原则"
      ],
      [
        "2014",
        "HashiCorp 发布 Terraform 0.1；声明式云基础设施配置对更广泛的工程社区变得可及"
      ],
      [
        "2016",
        "Kief Morris 出版《基础设施即代码》第一版，阐述成熟度阶段和工程实践"
      ],
      [
        "2021",
        "平台工程运动将 IaC 成熟度定位为开发者自服务的先决条件；CNCF 发布平台工程成熟度模型"
      ]
    ],
    "dos": [
      "Do start by codifying one well-understood, low-risk infrastructure domain (e.g., DNS records or S3 buckets) to prove the workflow before expanding scope",
      "Do enforce IaC linting and static analysis (tflint, checkov) in CI to catch configuration errors and security misconfigurations before apply",
      "Do use remote state backends with state locking to prevent concurrent apply operations from corrupting infrastructure state",
      "Do build a module registry with versioned, documented modules so that teams consume approved patterns rather than writing ad-hoc resources"
    ],
    "dos_zh": [
      "务必从编码一个易于理解、低风险的基础设施域（如 DNS 记录或 S3 存储桶）开始，在扩大范围前验证工作流",
      "务必在 CI 中强制执行 IaC 代码检查和静态分析（tflint、checkov），在应用前捕获配置错误和安全错误配置",
      "务必使用带状态锁定的远程状态后端，防止并发应用操作损坏基础设施状态",
      "务必构建带有版本化、文档化模块的模块注册表，使团队使用已批准的模式而非编写临时资源"
    ],
    "donts": [
      "Don't import existing manually-created resources into IaC state without first auditing and documenting their configuration, because undocumented resources become IaC liabilities",
      "Don't apply infrastructure changes directly from developer workstations in production — all changes must flow through CI/CD with plan review",
      "Don't treat IaC maturity as a binary milestone — incremental progress is more sustainable than attempting a big-bang migration of all infrastructure at once",
      "Don't confuse configuration management (Ansible, Chef) with infrastructure provisioning (Terraform, Pulumi) — they solve different problems at different layers"
    ],
    "donts_zh": [
      "不要在未先审计和记录配置的情况下将现有手动创建的资源导入 IaC 状态——未记录的资源成为 IaC 负担",
      "不要在生产环境中从开发者工作站直接应用基础设施变更——所有变更必须通过带计划审查的 CI/CD",
      "不要将 IaC 成熟度视为二元里程碑——增量进展比一次性迁移所有基础设施的大爆炸方式更可持续",
      "不要混淆配置管理（Ansible、Chef）与基础设施配置（Terraform、Pulumi）——它们在不同层解决不同问题"
    ],
    "case_study_company": "Monzo",
    "case_study": "Monzo, the UK digital bank, progressed from a manual AWS console-based setup in 2015 to a fully automated Terraform-based IaC platform by 2019. Their journey followed the maturity model closely: starting with scripts, moving to Terraform modules managed by a central platform team, and eventually enabling product engineers to self-provision infrastructure via an internal catalog. By Level 4, Monzo engineers could spin up a new microservice with a complete networking, IAM, and observability stack in under 10 minutes via a self-service portal.",
    "case_study_zh": "英国数字银行 Monzo 从2015年基于手动 AWS 控制台的设置，到2019年发展为完全自动化的基于 Terraform 的 IaC 平台。他们的旅程紧密遵循成熟度模型：从脚本开始，转向由中央平台团队管理的 Terraform 模块，最终通过内部目录使产品工程师能够自服务配置基础设施。到达4级时，Monzo 工程师可以通过自服务门户在10分钟内创建一个具有完整网络、IAM 和可观测性栈的新微服务。",
    "when_not_to_use": [
      "Proof-of-concept or hackathon environments where the overhead of IaC setup outweighs the short lifecycle of the infrastructure",
      "Organizations with a single engineer managing infrastructure where formalized maturity levels add bureaucratic overhead without team benefit",
      "Legacy infrastructure with undocumented, bespoke configurations where a maturity model assessment is premature before a discovery audit",
      "Environments locked to a single cloud provider's proprietary tooling where generic IaC maturity models may not map cleanly"
    ],
    "when_not_to_use_zh": [
      "IaC 设置开销超过基础设施短暂生命周期的概念验证或黑客马拉松环境",
      "只有一名工程师管理基础设施的组织——正式化成熟度级别增加官僚开销而无团队收益",
      "在发现审计之前，成熟度模型评估为时过早的具有未记录的、定制配置的遗留基础设施",
      "锁定于单一云提供商专有工具的环境——通用 IaC 成熟度模型可能无法干净映射"
    ],
    "adopters": [
      "HashiCorp",
      "Monzo",
      "Spotify",
      "Thoughtworks",
      "Atlassian",
      "Cloudflare"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Kief Morris (2020). \"Infrastructure as Code: Dynamic Systems for the Cloud Age\" (2nd ed.). O'Reilly Media.",
    "secondary_sources": [
      "HashiCorp (2021). \"Infrastructure as Code Maturity Model\". hashicorp.com.",
      "Thoughtworks Technology Radar (2021). \"Evolving Infrastructure as Code practices\". thoughtworks.com/radar.",
      "CNCF (2022). \"Platform Engineering Maturity Model\". tag-app-delivery.cncf.io."
    ],
    "typed_relations": [
      {
        "slug": "infrastructure-as-code",
        "type": "extends"
      },
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "platform-engineering",
        "type": "complement"
      },
      {
        "slug": "deployment-stamps-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 73,
    "name": "Strangler Fig Pattern",
    "name_zh": "绞杀榕模式",
    "slug": "strangler-fig-pattern",
    "category": "evolution",
    "desc": "Incrementally replace a legacy system by routing traffic to new modules",
    "desc_zh": "通过逐步将流量路由至新模块，渐进式替换遗留系统",
    "steps": [
      "Identify the legacy system boundary and map all entry points (APIs, events, UI routes)",
      "Build a facade or proxy layer that intercepts all requests to the legacy system",
      "Incrementally implement new functionality behind the facade, feature by feature",
      "Redirect traffic from the facade to the new implementation one slice at a time",
      "Retire legacy code once all traffic has been migrated and verified stable"
    ],
    "steps_zh": [
      "识别遗留系统边界，映射所有入口点（API、事件、UI 路由）",
      "构建外观层或代理层，拦截所有流向遗留系统的请求",
      "在外观层后面逐项功能地增量实现新功能",
      "逐片将流量从外观层重定向至新实现",
      "所有流量迁移并验证稳定后，退役遗留代码"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Legacy",
      "Facade",
      "New Service",
      "Decommission"
    ],
    "viz_labels_zh": [
      "遗留系统",
      "外观层",
      "新服务",
      "下线旧系统"
    ],
    "related": [
      "branch-by-abstraction",
      "parallel-run",
      "microservices-decomposition"
    ],
    "tags": [
      "migration",
      "legacy",
      "incremental",
      "strangler-fig"
    ],
    "origin_author": "Martin Fowler, 2004",
    "origin_source": "StranglerFigApplication (martinfowler.com)",
    "origin_source_zh": "《绞杀榕应用》（martinfowler.com 博文）",
    "complexity": "intermediate",
    "when_to_use": [
      "Migrating a monolithic legacy system to microservices without a full rewrite",
      "Replacing an aging e-commerce platform while keeping it live for customers",
      "Gradually moving from an on-premises system to cloud-native services",
      "Modernizing a mainframe backend one domain at a time"
    ],
    "when_to_use_zh": [
      "将单体遗留系统迁移到微服务架构，而无需全面重写",
      "在保持客户正常使用的同时替换老化的电商平台",
      "从本地部署系统逐步迁移至云原生服务",
      "按业务领域逐步现代化大型机后端"
    ],
    "core_concepts": [
      "Facade/Proxy: An interception layer that sits between consumers and the legacy system, enabling transparent traffic routing",
      "Incremental migration: Replace one vertical slice of functionality at a time rather than attempting a big-bang rewrite",
      "Traffic routing: Gradually shift requests from old to new implementation using load balancers, API gateways, or URL rewrites",
      "Asset capture: Each migrated slice permanently removes a piece of legacy code, like a strangler fig enveloping its host tree",
      "Rollback safety: The facade can instantly revert traffic to the legacy path if the new implementation exhibits issues"
    ],
    "core_concepts_zh": [
      "外观/代理层：位于消费者与遗留系统之间的拦截层，实现透明的流量路由",
      "增量迁移：每次替换一个纵向功能切片，而非尝试一次性大爆炸重写",
      "流量路由：通过负载均衡器、API 网关或 URL 重写，逐步将请求从旧实现转移至新实现",
      "资产蚕食：每个已迁移的切片永久移除一段遗留代码，如同绞杀榕逐渐包裹宿主树",
      "回滚安全性：当新实现出现问题时，外观层可立即将流量恢复至遗留路径"
    ],
    "timeline": [
      [
        "2004",
        "Martin Fowler publishes the original Strangler Fig Application article"
      ],
      [
        "2012",
        "Pattern gains traction with the rise of microservices at companies like Netflix"
      ],
      [
        "2015",
        "Sam Newman highlights Strangler Fig in 'Building Microservices' as a key migration strategy"
      ],
      [
        "2019",
        "AWS and Azure publish official cloud migration guides featuring the Strangler Fig pattern"
      ],
      [
        "2023",
        "Pattern adopted for AI system migrations, replacing legacy ML pipelines with LLM-based services"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "Martin Fowler 发表原始的绞杀榕应用文章"
      ],
      [
        "2012",
        "随着 Netflix 等公司推行微服务，该模式获得广泛关注"
      ],
      [
        "2015",
        "Sam Newman 在《构建微服务》中将绞杀榕作为核心迁移策略加以强调"
      ],
      [
        "2019",
        "AWS 和 Azure 在官方云迁移指南中正式纳入绞杀榕模式"
      ],
      [
        "2023",
        "该模式被应用于 AI 系统迁移，用基于 LLM 的服务替换遗留 ML 管线"
      ]
    ],
    "dos": [
      "Start with the highest-value, most well-understood slice to build team confidence early",
      "Invest in comprehensive monitoring and diff-comparison between old and new paths",
      "Keep the facade layer thin and stateless so it does not become a bottleneck itself",
      "Celebrate each retired legacy module to maintain team momentum"
    ],
    "dos_zh": [
      "从价值最高、最易理解的切片开始，尽早建立团队信心",
      "在新旧路径之间建立全面的监控和差异比对机制",
      "保持外观层轻薄且无状态，避免其自身成为瓶颈",
      "每退役一个遗留模块都值得庆祝，以维持团队动力"
    ],
    "donts": [
      "Don't attempt to migrate everything at once — the whole point is incremental progress",
      "Don't let the facade grow into a 'smart' middleware with business logic",
      "Don't neglect the legacy system's maintenance during migration — it's still serving live traffic",
      "Don't skip writing tests for the new implementation just because the legacy system 'already works'"
    ],
    "donts_zh": [
      "不要试图一次性迁移所有内容——增量推进是该模式的核心要义",
      "不要让外观层演变为包含业务逻辑的「智能」中间件",
      "在迁移期间不要忽视遗留系统的维护——它仍在处理线上流量",
      "不要因为遗留系统「已经能用」就跳过为新实现编写测试"
    ],
    "case_study_company": "The Guardian",
    "case_study": "The Guardian newspaper migrated its website from a legacy Java CMS to a modern Scala/Node.js stack between 2011 and 2014 using the Strangler Fig pattern. They placed an Nginx proxy in front of the old system and routed one section at a time (sports, then news, then opinion) to the new stack. The migration was invisible to readers, and the legacy CMS was fully decommissioned without a single day of downtime.",
    "case_study_zh": "《卫报》在 2011 年至 2014 年间使用绞杀榕模式将其网站从遗留 Java CMS 迁移至现代 Scala/Node.js 技术栈。他们在旧系统前部署 Nginx 代理，依次将各版块（先体育、再新闻、后评论）的流量切换至新系统。整个迁移对读者完全透明，遗留 CMS 在零停机的情况下被彻底退役。",
    "when_not_to_use": [
      "The legacy system is small enough that a clean rewrite can be completed in a single sprint",
      "There is no clear boundary or API surface to intercept between consumers and the legacy system",
      "The legacy system has no automated tests and its behavior cannot be reliably verified against the new system",
      "Organizational urgency demands an immediate cutover rather than gradual migration"
    ],
    "when_not_to_use_zh": [
      "遗留系统足够小，可以在单个迭代内完成干净的重写",
      "消费者与遗留系统之间没有清晰的边界或可拦截的 API 接口",
      "遗留系统没有自动化测试，其行为无法与新系统进行可靠比对",
      "组织紧迫性要求立即切换，而非渐进式迁移"
    ],
    "adopters": [
      "The Guardian",
      "Netflix",
      "Shopify",
      "GOV.UK",
      "Amazon"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Martin Fowler (2004). \"StranglerFigApplication\". martinfowler.com.",
    "secondary_sources": [
      "Sam Newman (2019). \"Monolith to Microservices: Evolutionary Patterns to Transform Your Monolith\". O'Reilly Media.",
      "Michael Feathers (2004). \"Working Effectively with Legacy Code\". Prentice Hall."
    ],
    "typed_relations": [
      {
        "slug": "branch-by-abstraction",
        "type": "complement"
      },
      {
        "slug": "parallel-run",
        "type": "complement"
      },
      {
        "slug": "microservices-decomposition",
        "type": "related"
      }
    ]
  },
  {
    "id": 74,
    "name": "Branch by Abstraction",
    "name_zh": "抽象分支法",
    "slug": "branch-by-abstraction",
    "category": "evolution",
    "desc": "Replace a component in-place via an abstraction layer without feature branches",
    "desc_zh": "通过抽象层原地替换组件，无需功能分支",
    "steps": [
      "Introduce an abstraction (interface or adapter) over the component to be replaced",
      "Make all callers depend on the abstraction rather than the concrete implementation",
      "Build the new implementation behind the same abstraction, deploying it to production dark",
      "Toggle traffic gradually from the old implementation to the new one using feature flags",
      "Remove the old implementation and the abstraction layer once migration is complete"
    ],
    "steps_zh": [
      "在待替换组件上引入抽象层（接口或适配器）",
      "使所有调用方依赖抽象而非具体实现",
      "在相同抽象后构建新实现，以暗部署方式发布到生产环境",
      "使用功能开关将流量从旧实现逐步切换至新实现",
      "迁移完成后移除旧实现与抽象层"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Abstraction",
      "Old Impl",
      "New Impl",
      "Switch"
    ],
    "viz_labels_zh": [
      "抽象层",
      "旧实现",
      "新实现",
      "切换"
    ],
    "related": [
      "strangler-fig-pattern",
      "feature-flags",
      "hexagonal-architecture"
    ],
    "tags": [
      "abstraction",
      "migration",
      "refactoring",
      "trunk-based"
    ],
    "origin_author": "Paul Hammant, 2007",
    "origin_source": "Branch by Abstraction (paulhammant.com)",
    "origin_source_zh": "《抽象分支法》（paulhammant.com 博文）",
    "complexity": "intermediate",
    "when_to_use": [
      "Replacing a core library or framework dependency (e.g., ORM, logging, HTTP client) without long-lived branches",
      "Migrating from one database vendor to another while keeping the application deployable",
      "Swapping out a payment gateway while both old and new providers must remain operational",
      "Trunk-based development teams that cannot afford long-running feature branches"
    ],
    "when_to_use_zh": [
      "替换核心库或框架依赖（如 ORM、日志、HTTP 客户端）而无需长期分支",
      "在保持应用可部署的同时从一个数据库供应商迁移到另一个",
      "在新旧支付网关都必须保持运行的情况下进行切换",
      "采用主干开发的团队无法承担长期运行的功能分支"
    ],
    "core_concepts": [
      "Abstraction layer: A thin interface or adapter inserted between callers and the component being replaced",
      "Dark deployment: The new implementation is deployed to production but not yet serving live traffic",
      "Feature flags: Runtime toggles that control which implementation receives requests",
      "Trunk-based development: All work happens on the main branch, avoiding merge conflicts from long-lived branches",
      "Incremental switchover: Traffic shifts gradually, enabling safe rollback at any point"
    ],
    "core_concepts_zh": [
      "抽象层：在调用方与待替换组件之间插入的薄接口或适配器",
      "暗部署：新实现已部署至生产环境，但尚未处理线上流量",
      "功能开关：控制哪个实现接收请求的运行时开关",
      "主干开发：所有工作在主分支上进行，避免长期分支带来的合并冲突",
      "增量切换：流量逐步转移，在任何时刻都可安全回滚"
    ],
    "timeline": [
      [
        "2007",
        "Paul Hammant coins the term 'Branch by Abstraction' on his blog"
      ],
      [
        "2010",
        "Jez Humble and David Farley reference the technique in 'Continuous Delivery'"
      ],
      [
        "2013",
        "Facebook engineering adopts the pattern for their Mercurial-to-Git migration tooling"
      ],
      [
        "2016",
        "ThoughtWorks Technology Radar recommends it as a standard migration practice"
      ],
      [
        "2020",
        "Pattern becomes a staple of trunk-based development guides and CI/CD best practices"
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "Paul Hammant 在博客上提出「抽象分支法」概念"
      ],
      [
        "2010",
        "Jez Humble 和 David Farley 在《持续交付》中引用该技术"
      ],
      [
        "2013",
        "Facebook 工程团队在 Mercurial 到 Git 的迁移工具中采用该模式"
      ],
      [
        "2016",
        "ThoughtWorks 技术雷达将其推荐为标准迁移实践"
      ],
      [
        "2020",
        "该模式成为主干开发指南和 CI/CD 最佳实践的核心内容"
      ]
    ],
    "dos": [
      "Keep the abstraction layer as thin as possible — it should only delegate, not contain logic",
      "Write adapter tests that run against both old and new implementations to ensure behavioral parity",
      "Use feature flags with kill-switch capability for instant rollback in production",
      "Communicate the migration timeline clearly so the team doesn't leave both implementations running indefinitely"
    ],
    "dos_zh": [
      "保持抽象层尽可能薄——它只应委派调用，不应包含逻辑",
      "编写适配器测试，同时运行新旧实现以确保行为一致性",
      "使用带有熔断功能的功能开关，以便在生产中即时回滚",
      "清晰传达迁移时间线，避免团队无限期地保留两套实现"
    ],
    "donts": [
      "Don't let the abstraction layer accumulate business logic — it should remain a pure delegation boundary",
      "Don't skip the step of migrating all callers to the abstraction before building the new implementation",
      "Don't leave the old implementation in the codebase after migration — dead code breeds confusion",
      "Don't use this pattern for UI-layer changes where a simple feature flag would suffice"
    ],
    "donts_zh": [
      "不要让抽象层积累业务逻辑——它应始终是纯粹的委派边界",
      "在构建新实现之前，不要跳过将所有调用方迁移到抽象层的步骤",
      "迁移完成后不要将旧实现留在代码库中——死代码会造成混乱",
      "不要将此模式用于简单功能开关即可解决的 UI 层变更"
    ],
    "case_study_company": "Flickr",
    "case_study": "Flickr used Branch by Abstraction to migrate from their original PHP-based image processing pipeline to a new Java-based service. They introduced an abstraction interface for image transformations, deployed the Java service dark alongside the existing PHP code, and gradually shifted traffic over several weeks. The migration completed with zero user-facing outages and eliminated their long-standing deployment bottleneck.",
    "case_study_zh": "Flickr 使用抽象分支法将原始的 PHP 图像处理管线迁移至新的 Java 服务。他们引入了图像变换的抽象接口，将 Java 服务以暗部署方式与现有 PHP 代码并行部署，并在数周内逐步切换流量。迁移过程中零用户可见故障，并消除了长期存在的部署瓶颈。",
    "when_not_to_use": [
      "The component has no clear API boundary and is deeply entangled with other modules",
      "The replacement will be done in a single commit with trivial scope",
      "The team is not practicing trunk-based development and prefers long-lived feature branches",
      "Performance-critical code paths where even a thin abstraction layer introduces unacceptable latency"
    ],
    "when_not_to_use_zh": [
      "待替换组件没有清晰的 API 边界，与其他模块深度耦合",
      "替换范围很小，可以在单次提交中完成",
      "团队未采用主干开发，而是偏好长期功能分支",
      "性能关键的代码路径中，即使薄抽象层也会引入不可接受的延迟"
    ],
    "adopters": [
      "Flickr",
      "Facebook",
      "Google",
      "ThoughtWorks",
      "Atlassian"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Paul Hammant (2007). \"Branch by Abstraction\". paulhammant.com.",
    "secondary_sources": [
      "Jez Humble and David Farley (2010). \"Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation\". Addison-Wesley.",
      "Martin Fowler (2014). \"BranchByAbstraction\". martinfowler.com."
    ],
    "typed_relations": [
      {
        "slug": "strangler-fig-pattern",
        "type": "complement"
      },
      {
        "slug": "feature-flags",
        "type": "complement"
      },
      {
        "slug": "hexagonal-architecture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 75,
    "name": "Parallel Run",
    "name_zh": "并行运行模式",
    "slug": "parallel-run",
    "category": "evolution",
    "desc": "Run old and new implementations simultaneously and compare outputs for safety",
    "desc_zh": "同时运行新旧实现并对比输出，确保迁移安全性",
    "steps": [
      "Deploy both the legacy and new implementations into the same production environment",
      "Route all live requests to both systems simultaneously without exposing new results to users",
      "Capture and compare outputs from both systems, logging all discrepancies",
      "Investigate and resolve every discrepancy until the new system output matches the legacy",
      "Decommission the legacy system and promote the new implementation as the sole handler"
    ],
    "steps_zh": [
      "将遗留系统与新实现同时部署到同一生产环境",
      "将所有线上请求同时路由至两个系统，不向用户暴露新结果",
      "捕获并比对两系统的输出，记录所有差异",
      "排查并修复每一处差异，直到新系统输出与遗留系统一致",
      "下线遗留系统，将新实现晋升为唯一处理方"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Input",
      "Old Path",
      "New Path",
      "Compare"
    ],
    "viz_labels_zh": [
      "输入",
      "旧路径",
      "新路径",
      "结果对比"
    ],
    "related": [
      "strangler-fig-pattern",
      "canary-deployment",
      "blue-green-deployment"
    ],
    "tags": [
      "parallel-run",
      "migration",
      "comparison",
      "verification"
    ],
    "origin_author": "Michael Feathers, 2004",
    "origin_source": "Working Effectively with Legacy Code (Prentice Hall)",
    "origin_source_zh": "《修改代码的艺术》（Prentice Hall 出版）",
    "complexity": "advanced",
    "when_to_use": [
      "Migrating a financial calculation engine where correctness must be formally verified",
      "Replacing a fraud detection system where even small discrepancies have major consequences",
      "Switching to an AI/ML-based recommendation engine and validating it against rule-based logic",
      "Migrating critical data processing pipelines that must produce bit-identical results"
    ],
    "when_to_use_zh": [
      "迁移金融计算引擎，必须对正确性进行形式化验证",
      "替换欺诈检测系统，即使微小差异也会造成重大后果",
      "切换至基于 AI/ML 的推荐引擎，并与基于规则的逻辑进行对比验证",
      "迁移关键数据处理管线，要求产出完全一致的结果"
    ],
    "core_concepts": [
      "Dual execution: Every request is processed by both old and new systems simultaneously in production",
      "Shadow mode: The new system's output is captured but never shown to users during verification",
      "Output comparison: Automated diff tooling detects semantic or numeric discrepancies between the two outputs",
      "Discrepancy resolution: Every mismatch is investigated, categorized, and fixed before cutover",
      "Confidence threshold: A quantitative bar (e.g., 99.99% match rate) that must be met before decommissioning the old system"
    ],
    "core_concepts_zh": [
      "双重执行：每个请求在生产中同时由新旧两个系统处理",
      "影子模式：在验证期间捕获新系统的输出但不展示给用户",
      "输出比对：自动化差异工具检测两个输出之间的语义或数值差异",
      "差异消解：每一处不匹配都经过调查、分类和修复，然后才进行切换",
      "置信度阈值：在下线旧系统之前必须达到的量化标准（如 99.99% 匹配率）"
    ],
    "timeline": [
      [
        "2004",
        "Michael Feathers describes parallel testing techniques in 'Working Effectively with Legacy Code'"
      ],
      [
        "2013",
        "GitHub develops the Scientist library in Ruby to formalize parallel run experiments"
      ],
      [
        "2016",
        "GitHub open-sources Scientist, popularizing the pattern across the industry"
      ],
      [
        "2018",
        "Stripe adopts parallel runs for migrating their core payment processing engine"
      ],
      [
        "2023",
        "Pattern extended to AI model validation — running legacy rules alongside new LLM-based systems"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "Michael Feathers 在《修改代码的艺术》中描述并行测试技术"
      ],
      [
        "2013",
        "GitHub 开发 Ruby 版 Scientist 库，将并行运行实验规范化"
      ],
      [
        "2016",
        "GitHub 开源 Scientist 库，推动该模式在行业内广泛传播"
      ],
      [
        "2018",
        "Stripe 在核心支付处理引擎迁移中采用并行运行模式"
      ],
      [
        "2023",
        "该模式扩展至 AI 模型验证——将遗留规则与新的 LLM 系统并行运行"
      ]
    ],
    "dos": [
      "Use a library like GitHub Scientist to standardize experiment setup, comparison, and reporting",
      "Log every discrepancy with full request context so root-cause analysis is efficient",
      "Set a clear quantitative threshold for cutover confidence before starting the parallel run",
      "Run the parallel comparison long enough to capture edge cases across business cycles"
    ],
    "dos_zh": [
      "使用 GitHub Scientist 等库来标准化实验设置、比对和报告",
      "记录每一处差异及完整的请求上下文，以便高效进行根因分析",
      "在开始并行运行之前，明确设定切换信心的量化阈值",
      "运行并行比对的时间足够长，以覆盖各业务周期中的边缘情况"
    ],
    "donts": [
      "Don't expose the new system's results to users until the match rate meets the confidence threshold",
      "Don't ignore low-frequency discrepancies — they often represent critical edge cases",
      "Don't underestimate the infrastructure cost of running two systems in parallel at production scale",
      "Don't skip performance testing — the new system may match functionally but degrade latency"
    ],
    "donts_zh": [
      "在匹配率达到置信阈值之前，不要将新系统的结果展示给用户",
      "不要忽略低频差异——它们往往代表关键的边缘情况",
      "不要低估在生产规模下并行运行两个系统的基础设施成本",
      "不要跳过性能测试——新系统功能匹配但可能导致延迟恶化"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub created the Scientist library in 2013 to safely migrate their core merge algorithm. They ran the old and new merge code paths in parallel on every pull request merge, comparing results without exposing the new output to users. Over several months, they identified and fixed dozens of edge-case discrepancies, ultimately achieving a 100% match rate before switching over. The library was open-sourced in 2016 and has been adopted by hundreds of companies.",
    "case_study_zh": "GitHub 于 2013 年创建了 Scientist 库，以安全地迁移其核心合并算法。他们在每次拉取请求合并时并行运行新旧合并代码路径，在不向用户暴露新输出的情况下比对结果。经过数月时间，他们发现并修复了数十个边缘情况差异，最终在切换前达到了 100% 匹配率。该库于 2016 年开源，已被数百家公司采用。",
    "when_not_to_use": [
      "The system under migration has side effects (e.g., sends emails, charges credit cards) that cannot be safely duplicated",
      "Infrastructure budget cannot support double the compute and storage resources",
      "Output comparison is meaningless because the new system intentionally produces different results",
      "The migration scope is trivial and a simple A/B test or feature flag would suffice"
    ],
    "when_not_to_use_zh": [
      "被迁移系统存在副作用（如发送邮件、信用卡扣款）且无法安全地重复执行",
      "基础设施预算无法支撑双倍的计算和存储资源",
      "输出比对无意义，因为新系统有意产出不同的结果",
      "迁移范围很小，简单的 A/B 测试或功能开关即可满足需求"
    ],
    "adopters": [
      "GitHub",
      "Stripe",
      "Intercom",
      "Shopify",
      "SoundCloud"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Michael Feathers (2004). \"Working Effectively with Legacy Code\". Prentice Hall.",
    "secondary_sources": [
      "Sam Newman (2019). \"Monolith to Microservices: Evolutionary Patterns to Transform Your Monolith\". O'Reilly Media.",
      "Martin Fowler (2011). \"ParallelChange\". martinfowler.com."
    ],
    "typed_relations": [
      {
        "slug": "strangler-fig-pattern",
        "type": "complement"
      },
      {
        "slug": "canary-deployment",
        "type": "complement"
      },
      {
        "slug": "blue-green-deployment",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 76,
    "name": "Technical Debt Quadrant",
    "name_zh": "技术债务象限",
    "slug": "technical-debt-quadrant",
    "category": "evolution",
    "desc": "Classify tech debt by deliberate/inadvertent and reckless/prudent axes",
    "desc_zh": "按有意/无意与鲁莽/谨慎两轴对技术债务分类",
    "steps": [
      "Audit the codebase and collect all known shortcuts, workarounds, and design smells",
      "Classify each item on two axes: deliberate vs. inadvertent and reckless vs. prudent",
      "Prioritize reckless-inadvertent debt for immediate remediation as it carries highest risk",
      "Schedule prudent-deliberate debt paydown as part of regular sprint capacity allocation",
      "Document decisions to take on deliberate debt with explicit repayment plans and owners"
    ],
    "steps_zh": [
      "审计代码库，收集所有已知的快捷方式、变通方案和设计异味",
      "按两轴对每项进行分类：有意 vs. 无意，鲁莽 vs. 谨慎",
      "优先修复「鲁莽-无意」象限的债务，因其风险最高",
      "将「谨慎-有意」债务的偿还纳入常规迭代容量规划",
      "记录主动承担技术债务的决策，明确偿还计划与负责人"
    ],
    "ai_relevant": false,
    "viz_type": "quadrant",
    "viz_labels": [
      "Reckless",
      "Prudent",
      "Deliberate",
      "Inadvertent"
    ],
    "viz_labels_zh": [
      "鲁莽",
      "谨慎",
      "刻意",
      "无意"
    ],
    "related": [
      "mikado-method",
      "continuous-architecture",
      "architectural-fitness-functions"
    ],
    "tags": [
      "tech-debt",
      "classification",
      "prioritization",
      "risk-management"
    ],
    "origin_author": "Martin Fowler, 2009",
    "origin_source": "TechnicalDebtQuadrant (martinfowler.com)",
    "origin_source_zh": "《技术债务象限》（martinfowler.com 博文）",
    "complexity": "beginner",
    "when_to_use": [
      "A team needs a shared language to discuss and prioritize different kinds of technical debt",
      "Sprint retrospectives keep surfacing vague complaints about code quality without actionable categories",
      "Engineering leadership needs to communicate the business impact of technical debt to non-technical stakeholders",
      "Deciding whether to take on tactical shortcuts during a tight product deadline"
    ],
    "when_to_use_zh": [
      "团队需要统一语言来讨论和排列不同类型技术债务的优先级",
      "迭代回顾中反复出现关于代码质量的模糊抱怨，却缺乏可操作的分类",
      "工程领导层需要向非技术干系人阐述技术债务的业务影响",
      "在紧迫的产品交付期限内决定是否采取战术性快捷方式"
    ],
    "core_concepts": [
      "Deliberate vs. Inadvertent: Whether the team knowingly took on the debt or stumbled into it through lack of knowledge",
      "Reckless vs. Prudent: Whether the debt was taken carelessly or as a calculated trade-off with a repayment plan",
      "Reckless-Deliberate: 'We know this is wrong but ship it anyway' — highest urgency to fix",
      "Prudent-Inadvertent: 'Now we know how we should have done it' — natural learning, address when revisiting that area",
      "Interest payments: The ongoing cost of working around technical debt — slower delivery, more bugs, higher onboarding time"
    ],
    "core_concepts_zh": [
      "有意与无意：团队是明知故犯地承担债务，还是因知识不足而无意间产生的",
      "鲁莽与谨慎：债务是粗心大意造成的，还是经过权衡并附带偿还计划的",
      "鲁莽-有意：'我们知道这样做不对但还是发布了'——修复优先级最高",
      "谨慎-无意：'现在我们知道本应怎么做了'——自然的学习过程，在重访该区域时处理",
      "利息支出：绕过技术债务的持续成本——交付变慢、缺陷增多、新人上手时间延长"
    ],
    "timeline": [
      [
        "1992",
        "Ward Cunningham introduces the technical debt metaphor at OOPSLA"
      ],
      [
        "2003",
        "Martin Fowler begins writing extensively about technical debt on his blog"
      ],
      [
        "2009",
        "Fowler publishes the Technical Debt Quadrant, adding the two-axis classification"
      ],
      [
        "2014",
        "The quadrant becomes a standard tool in agile coaching and sprint planning"
      ],
      [
        "2021",
        "McKinsey and Gartner report that tech debt consumes 20-40% of IT budgets, validating the framework's relevance"
      ]
    ],
    "timeline_zh": [
      [
        "1992",
        "Ward Cunningham 在 OOPSLA 大会上引入技术债务隐喻"
      ],
      [
        "2003",
        "Martin Fowler 开始在博客上大量撰写有关技术债务的文章"
      ],
      [
        "2009",
        "Fowler 发表技术债务象限，增加了两轴分类法"
      ],
      [
        "2014",
        "该象限成为敏捷教练和迭代规划中的标准工具"
      ],
      [
        "2021",
        "麦肯锡和 Gartner 报告技术债务消耗了 20-40% 的 IT 预算，验证了该框架的现实意义"
      ]
    ],
    "dos": [
      "Use the quadrant in sprint retros to categorize newly discovered debt and agree on priority",
      "Track prudent-deliberate debt in a visible backlog with clear owners and target payoff dates",
      "Allocate a fixed percentage (e.g., 15-20%) of sprint capacity to debt paydown every iteration",
      "Connect debt items to business metrics (velocity, incident rate) to justify remediation investment"
    ],
    "dos_zh": [
      "在迭代回顾中使用象限对新发现的债务进行分类并达成优先级共识",
      "在可见的待办清单中追踪谨慎-有意债务，明确负责人和目标偿还日期",
      "每个迭代固定分配一定比例（如 15-20%）的容量用于偿还债务",
      "将债务项关联到业务指标（交付速度、故障率），以论证修复投入的合理性"
    ],
    "donts": [
      "Don't treat all technical debt as equally urgent — the quadrant exists precisely to differentiate",
      "Don't use 'technical debt' as a catch-all excuse without classifying the specific type and impact",
      "Don't ignore inadvertent debt — it signals gaps in team knowledge that need coaching or training",
      "Don't let prudent-deliberate debt accumulate without repayment — interest compounds over time"
    ],
    "donts_zh": [
      "不要将所有技术债务视为同等紧迫——象限的存在正是为了区分优先级",
      "不要将「技术债务」作为万能借口而不分类其具体类型和影响",
      "不要忽视无意债务——它预示着团队知识的缺口，需要辅导或培训",
      "不要让谨慎-有意债务无偿还计划地持续累积——利息会随时间复利增长"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify adopted the Technical Debt Quadrant in their squad-based engineering culture to give autonomous teams a shared vocabulary for debt prioritization. Each squad categorized their debt using the quadrant during quarterly planning, and reckless-inadvertent items were flagged for immediate attention. This approach reduced their mean incident recovery time by 30% over two years, as the most dangerous hidden debt was systematically surfaced and addressed.",
    "case_study_zh": "Spotify 在其小队制工程文化中采用了技术债务象限，为自治团队提供统一的债务优先级词汇。每个小队在季度规划中使用象限对债务进行分类，鲁莽-无意类别的项目被标记为即刻关注。这一方法在两年内将平均故障恢复时间缩短了 30%，因为最危险的隐藏债务被系统性地识别和处理。",
    "when_not_to_use": [
      "The codebase is brand new with minimal accumulated shortcuts — classification overhead is unnecessary",
      "The team is in pure prototype/exploration mode where debt is expected and intentionally disposable",
      "A single monolithic debt item dominates everything — no need for a classification framework",
      "Stakeholders only care about debt quantity, not debt type (though this itself is a problem to fix)"
    ],
    "when_not_to_use_zh": [
      "代码库全新，几乎没有累积的捷径——分类开销不必要",
      "团队处于纯原型/探索模式，债务在预期之内，且代码本就打算用后即弃",
      "单一的整体性债务项主导一切——不需要分类框架",
      "干系人只关心债务数量而非类型（尽管这本身是需要解决的问题）"
    ],
    "adopters": [
      "Spotify",
      "ThoughtWorks",
      "Atlassian",
      "Pivotal Labs",
      "Etsy"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Martin Fowler (2009). \"TechnicalDebtQuadrant\". martinfowler.com.",
    "secondary_sources": [
      "Ward Cunningham (1992). \"The WyCash Portfolio Management System\". OOPSLA 1992 Experience Report.",
      "Steve McConnell (2007). \"Technical Debt\". Construx Software Blog."
    ],
    "typed_relations": [
      {
        "slug": "mikado-method",
        "type": "complement"
      },
      {
        "slug": "continuous-architecture",
        "type": "complement"
      },
      {
        "slug": "architectural-fitness-functions",
        "type": "complement"
      }
    ]
  },
  {
    "id": 77,
    "name": "Architectural Fitness Functions",
    "name_zh": "架构适应度函数",
    "slug": "architectural-fitness-functions",
    "category": "evolution",
    "desc": "Automated tests that continuously verify architectural characteristics stay intact",
    "desc_zh": "通过自动化测试持续验证架构特性保持完整",
    "steps": [
      "Identify the critical architectural characteristics to protect (coupling, latency, security)",
      "Define measurable thresholds for each characteristic (e.g., max cyclic dependencies = 0)",
      "Implement automated tests or tools (ArchUnit, chaos experiments, load tests) per characteristic",
      "Integrate fitness function checks into the CI/CD pipeline as mandatory quality gates",
      "Review and evolve the fitness function suite as architectural priorities shift over time"
    ],
    "steps_zh": [
      "确定需要保护的关键架构特性（耦合度、延迟、安全性等）",
      "为每项特性定义可量化阈值（如：最大循环依赖数 = 0）",
      "针对每项特性实现自动化测试或工具（ArchUnit、混沌实验、负载测试）",
      "将适应度函数检查集成到 CI/CD 流水线作为强制质量门禁",
      "随架构优先级的变化，持续审视和演进适应度函数套件"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Scalability",
      "Security",
      "Reliability",
      "Performance",
      "Maintainability"
    ],
    "viz_labels_zh": [
      "可扩展性",
      "安全性",
      "可靠性",
      "性能",
      "可维护性"
    ],
    "related": [
      "continuous-architecture",
      "technical-debt-quadrant",
      "test-pyramid"
    ],
    "tags": [
      "fitness-functions",
      "automated-testing",
      "architecture-governance",
      "ci-cd"
    ],
    "origin_author": "Neal Ford, Rebecca Parsons & Patrick Kua, 2017",
    "origin_source": "Building Evolutionary Architectures (O'Reilly)",
    "origin_source_zh": "《演进式架构》（O'Reilly 出版）",
    "complexity": "advanced",
    "when_to_use": [
      "Protecting performance SLAs as a growing team continuously ships code changes",
      "Enforcing module boundaries in a monorepo to prevent creeping coupling between domains",
      "Validating security compliance rules (e.g., no direct DB access from the API layer) automatically in CI",
      "Ensuring that migration to microservices doesn't inadvertently introduce circular dependencies"
    ],
    "when_to_use_zh": [
      "在不断壮大的团队持续交付代码变更时保护性能 SLA",
      "在单仓库中强制模块边界，防止领域间的耦合蔓延",
      "在 CI 中自动验证安全合规规则（如禁止 API 层直接访问数据库）",
      "确保向微服务迁移的过程中不会无意引入循环依赖"
    ],
    "core_concepts": [
      "Fitness function: An objective function that assesses how well the architecture exhibits a desired characteristic",
      "Architectural characteristic: A quality attribute (performance, modularity, security) the system must maintain",
      "Atomic fitness function: Tests a single characteristic in isolation (e.g., no cyclic dependencies)",
      "Holistic fitness function: Tests emergent behavior across multiple characteristics (e.g., chaos experiments)",
      "Continuous verification: Fitness functions run automatically in CI/CD, catching architectural drift before it reaches production"
    ],
    "core_concepts_zh": [
      "适应度函数：评估架构在某一期望特性上表现如何的目标函数",
      "架构特性：系统必须维持的质量属性（性能、模块化、安全性）",
      "原子适应度函数：独立测试单一特性（如无循环依赖）",
      "整体适应度函数：测试跨多个特性的涌现行为（如混沌实验）",
      "持续验证：适应度函数在 CI/CD 中自动运行，在架构漂移到达生产之前予以捕获"
    ],
    "timeline": [
      [
        "2017",
        "Ford, Parsons & Kua introduce fitness functions in 'Building Evolutionary Architectures'"
      ],
      [
        "2018",
        "ArchUnit library gains popularity for implementing dependency fitness functions in Java"
      ],
      [
        "2019",
        "Netflix integrates fitness functions into their chaos engineering and deployment pipelines"
      ],
      [
        "2021",
        "The concept is extended to data architecture with data quality fitness functions"
      ],
      [
        "2024",
        "AI-powered fitness functions emerge, using LLMs to evaluate code quality and architectural conformance"
      ]
    ],
    "timeline_zh": [
      [
        "2017",
        "Ford、Parsons 和 Kua 在《演进式架构》中引入适应度函数概念"
      ],
      [
        "2018",
        "ArchUnit 库在 Java 领域流行，用于实现依赖关系适应度函数"
      ],
      [
        "2019",
        "Netflix 将适应度函数集成到其混沌工程和部署流水线中"
      ],
      [
        "2021",
        "概念扩展至数据架构领域，出现数据质量适应度函数"
      ],
      [
        "2024",
        "AI 驱动的适应度函数出现，使用 LLM 评估代码质量和架构符合性"
      ]
    ],
    "dos": [
      "Start with the most critical characteristic first — typically the one causing production incidents",
      "Make fitness functions fast enough to run on every commit, not just nightly builds",
      "Version your fitness function thresholds alongside the code so they evolve together",
      "Use ArchUnit or similar tools for structural checks and chaos tests for runtime characteristics"
    ],
    "dos_zh": [
      "从最关键的特性开始——通常是导致生产事故的那个",
      "确保适应度函数运行速度足够快，能在每次提交时执行，而非仅在夜间构建",
      "将适应度函数的阈值与代码一起版本管理，使其协同演进",
      "使用 ArchUnit 等工具进行结构检查，使用混沌测试检验运行时特性"
    ],
    "donts": [
      "Don't create fitness functions for every possible characteristic — focus on the ones with real risk",
      "Don't set thresholds too loosely or too tightly — calibrate against current baseline data",
      "Don't treat fitness functions as static — they must evolve as the architecture's priorities change",
      "Don't rely only on compile-time checks — runtime characteristics like latency need production-like tests"
    ],
    "donts_zh": [
      "不要为每个可能的特性都创建适应度函数——聚焦于真正有风险的特性",
      "阈值不要设得太松或太紧——根据当前基线数据进行校准",
      "不要将适应度函数视为静态的——它们必须随着架构优先级的变化而演进",
      "不要仅依赖编译时检查——延迟等运行时特性需要类生产环境的测试"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix integrated architectural fitness functions into their deployment pipeline to safeguard their microservices architecture. They combined ArchUnit-style dependency checks with chaos engineering experiments (Chaos Monkey, Chaos Kong) to continuously validate both structural and runtime characteristics. This approach caught architectural drift early — for example, detecting when a new service inadvertently created a synchronous dependency chain that violated their latency budget.",
    "case_study_zh": "Netflix 将架构适应度函数集成到部署流水线中，以保护其微服务架构。他们将 ArchUnit 风格的依赖检查与混沌工程实验（Chaos Monkey、Chaos Kong）相结合，持续验证结构和运行时特性。这一方法及早捕获了架构漂移——例如，检测到某个新服务无意中创建了违反延迟预算的同步依赖链。",
    "when_not_to_use": [
      "A small startup with a simple codebase where manual code review is sufficient",
      "Purely experimental or throwaway prototypes where architectural integrity is not a priority",
      "The team lacks CI/CD infrastructure to run automated checks consistently",
      "Architecture is frozen and no new changes are planned — fitness functions protect evolving systems"
    ],
    "when_not_to_use_zh": [
      "小型初创公司代码库简单，人工代码评审即可满足需求",
      "纯实验性或一次性原型，架构完整性不是优先事项",
      "团队缺乏 CI/CD 基础设施来一致地运行自动化检查",
      "架构已冻结且无新变更计划——适应度函数保护的是演进中的系统"
    ],
    "adopters": [
      "Netflix",
      "ThoughtWorks",
      "Zalando",
      "Intuit",
      "N26"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Neal Ford, Rebecca Parsons, and Patrick Kua (2017). \"Building Evolutionary Architectures: Support Constant Change\". O'Reilly Media.",
    "secondary_sources": [
      "Neal Ford, Rebecca Parsons, Patrick Kua, and Pramod Sadalage (2023). \"Building Evolutionary Architectures, 2nd Edition\". O'Reilly Media.",
      "ThoughtWorks (2017). \"Fitness Functions\". thoughtworks.com/radar/techniques."
    ],
    "typed_relations": [
      {
        "slug": "continuous-architecture",
        "type": "complement"
      },
      {
        "slug": "technical-debt-quadrant",
        "type": "complement"
      },
      {
        "slug": "test-pyramid",
        "type": "complement"
      }
    ]
  },
  {
    "id": 78,
    "name": "Conway's Law",
    "name_zh": "康威定律",
    "slug": "conways-law",
    "category": "evolution",
    "desc": "Systems mirror the communication structure of the organizations that build them",
    "desc_zh": "系统的结构反映构建它的组织的沟通结构",
    "steps": [
      "Map current team communication patterns and organizational boundaries explicitly",
      "Audit the existing system architecture and identify how it mirrors team boundaries",
      "Detect misalignments where architecture crosses team lines, causing friction and bottlenecks",
      "Use the law prescriptively: redesign team structures to match the desired target architecture",
      "Iterate on both org design and system design together, treating them as co-evolving artifacts"
    ],
    "steps_zh": [
      "明确梳理当前团队沟通模式和组织边界",
      "审计现有系统架构，识别其如何映射团队边界",
      "检测架构跨越团队边界导致摩擦和瓶颈的错位之处",
      "将该定律用于规范性指导：重新设计团队结构以匹配目标架构",
      "将组织设计与系统设计作为协同演进的制品，一并持续迭代"
    ],
    "ai_relevant": false,
    "viz_type": "venn",
    "viz_labels": [
      "Team Structure",
      "System Design"
    ],
    "viz_labels_zh": [
      "团队结构",
      "系统设计"
    ],
    "related": [
      "inverse-conway-maneuver",
      "team-topologies",
      "microservices-decomposition"
    ],
    "tags": [
      "conways-law",
      "org-structure",
      "architecture-alignment",
      "communication"
    ],
    "origin_author": "Melvin Conway, 1967",
    "origin_source": "How Do Committees Invent? (Datamation, April 1968)",
    "origin_source_zh": "《委员会如何发明？》（Datamation 杂志，1968年4月）",
    "complexity": "beginner",
    "when_to_use": [
      "Diagnosing why a microservices architecture ended up as a distributed monolith",
      "Planning an organizational restructuring in tandem with a system redesign",
      "Explaining to leadership why cross-team dependencies are slowing delivery velocity",
      "Evaluating whether to split a monolith — the answer depends on whether teams can be split too"
    ],
    "when_to_use_zh": [
      "诊断为何微服务架构最终演变成分布式单体",
      "规划与系统重设计同步进行的组织重组",
      "向领导层解释为何跨团队依赖在拖慢交付速度",
      "评估是否拆分单体——答案取决于团队是否也能同步拆分"
    ],
    "core_concepts": [
      "Organizational isomorphism: Software architecture inevitably mirrors the organization's communication graph",
      "Communication paths: Module boundaries form along the lines of least communication resistance between teams",
      "Prescriptive application: Instead of observing the law passively, use it actively to shape team structures for the desired architecture",
      "Sociotechnical coupling: Technical decisions and organizational decisions are inseparable — changing one without the other creates friction",
      "Coordination cost: Cross-team interfaces tend to become the system's most brittle integration points"
    ],
    "core_concepts_zh": [
      "组织同构：软件架构不可避免地映射组织的沟通图谱",
      "沟通路径：模块边界沿团队间沟通阻力最小的路径形成",
      "规范性应用：不是被动观察该定律，而是主动利用它来塑造团队结构以获得期望的架构",
      "社会技术耦合：技术决策与组织决策不可分割——只改其一而不改另一会产生摩擦",
      "协调成本：跨团队接口往往成为系统中最脆弱的集成点"
    ],
    "timeline": [
      [
        "1967",
        "Melvin Conway submits 'How Do Committees Invent?' (published in Datamation, 1968)"
      ],
      [
        "1975",
        "Fred Brooks references Conway's insight in 'The Mythical Man-Month'"
      ],
      [
        "2010",
        "The 'Inverse Conway Maneuver' is coined by Jonny LeRoy and Matt Simons at ThoughtWorks"
      ],
      [
        "2015",
        "Microservices movement explicitly leverages Conway's Law for service boundary design"
      ],
      [
        "2019",
        "Team Topologies builds on Conway's Law, defining four team types and three interaction modes"
      ]
    ],
    "timeline_zh": [
      [
        "1967",
        "Melvin Conway 提交《委员会如何发明？》论文（1968年在 Datamation 发表）"
      ],
      [
        "1975",
        "Fred Brooks 在《人月神话》中引用康威的洞见"
      ],
      [
        "2010",
        "Jonny LeRoy 和 Matt Simons 在 ThoughtWorks 提出「逆康威策略」"
      ],
      [
        "2015",
        "微服务运动明确利用康威定律进行服务边界设计"
      ],
      [
        "2019",
        "《团队拓扑》以康威定律为基础，提出四种团队类型和三种交互模式"
      ]
    ],
    "dos": [
      "Map your organization chart and your system architecture side-by-side to reveal hidden coupling",
      "Align team ownership with service/module boundaries for faster, more autonomous delivery",
      "Use Conway's Law diagnostically first before attempting prescriptive reorganization",
      "Educate non-engineering leadership on the law so they understand the architectural impact of org changes"
    ],
    "dos_zh": [
      "将组织结构图与系统架构图并排对比，揭示隐藏的耦合",
      "将团队所有权与服务/模块边界对齐，以实现更快更自治的交付",
      "在尝试规范性重组之前，先诊断性地运用康威定律",
      "向非工程领导层普及该定律，使其理解组织变动对架构的影响"
    ],
    "donts": [
      "Don't reorganize teams without simultaneously considering the impact on software architecture",
      "Don't ignore Conway's Law and assume you can build any architecture with any org structure",
      "Don't apply it mechanically — team talent, culture, and domain complexity also matter",
      "Don't use it to justify architectural stagnation — 'our org can't support microservices' is often a solvable problem"
    ],
    "donts_zh": [
      "不要在不考虑对软件架构影响的情况下重组团队",
      "不要无视康威定律，假设任何组织结构都能构建出任何架构",
      "不要机械地套用——团队能力、文化和领域复杂度同样重要",
      "不要用它来为架构停滞辩护——'我们的组织不支持微服务'往往是可以解决的问题"
    ],
    "case_study_company": "Microsoft",
    "case_study": "A 2008 study by Microsoft Research (Nagappan, Murphy & Basili) empirically validated Conway's Law by analyzing Windows Vista. They found that organizational metrics — such as the number of engineers touching a module and the organizational distance between them — were stronger predictors of software defects than code metrics like complexity or coverage. Modules owned by distributed, loosely coordinated teams had significantly higher defect rates.",
    "case_study_zh": "微软研究院在 2008 年的一项研究（Nagappan、Murphy 和 Basili）通过分析 Windows Vista 实证验证了康威定律。他们发现，组织指标——如接触某模块的工程师数量及其组织距离——比代码复杂度或覆盖率等代码指标更能预测软件缺陷。由分布式、松散协调团队拥有的模块缺陷率显著更高。",
    "when_not_to_use": [
      "Very small teams (fewer than 8 people) where everyone communicates with everyone directly",
      "Temporary hackathon or prototype projects where organizational structure is intentionally fluid",
      "Outsourced or contract development where you have no control over team organization",
      "Highly regulated industries where architecture must follow compliance mandates regardless of org structure"
    ],
    "when_not_to_use_zh": [
      "非常小的团队（少于 8 人），每个人都能直接与所有人沟通",
      "临时的黑客马拉松或原型项目，组织结构有意保持流动",
      "外包或合同开发，无法控制团队组织方式",
      "高度监管的行业，架构必须遵循合规要求而非组织结构"
    ],
    "adopters": [
      "Microsoft",
      "Amazon",
      "Spotify",
      "Netflix",
      "ThoughtWorks"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Melvin Conway (1968). \"How Do Committees Invent?\". Datamation, 14(4).",
    "secondary_sources": [
      "Fred Brooks (1975). \"The Mythical Man-Month: Essays on Software Engineering\". Addison-Wesley.",
      "Matthew Skelton and Manuel Pais (2019). \"Team Topologies: Organizing Business and Technology Teams for Fast Flow\". IT Revolution Press."
    ],
    "typed_relations": [
      {
        "slug": "inverse-conway-maneuver",
        "type": "complement"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      },
      {
        "slug": "microservices-decomposition",
        "type": "related"
      }
    ]
  },
  {
    "id": 79,
    "name": "Inverse Conway Maneuver",
    "name_zh": "逆康威策略",
    "slug": "inverse-conway-maneuver",
    "category": "evolution",
    "desc": "Deliberately restructure teams to produce the desired system architecture",
    "desc_zh": "主动重组团队结构，以引导产出期望的系统架构",
    "steps": [
      "Define the target software architecture (e.g., bounded microservices, platform layers)",
      "Model teams around the desired architecture boundaries before writing any new code",
      "Establish clear team APIs: explicit ownership, stable interfaces, and interaction modes",
      "Spin up new teams or reorganize existing ones to align with the architecture blueprint",
      "Monitor whether the emerging code structure matches the intended architecture over 2-3 quarters"
    ],
    "steps_zh": [
      "定义目标软件架构（如：有界微服务、平台层次）",
      "在编写任何新代码之前，围绕期望的架构边界进行团队建模",
      "建立清晰的团队 API：明确所有权、稳定接口和交互模式",
      "新建团队或重组现有团队，使其与架构蓝图对齐",
      "在 2-3 个季度内监控涌现的代码结构是否符合预期架构"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Target Arch",
      "Org Design",
      "Team Align",
      "System"
    ],
    "viz_labels_zh": [
      "目标架构",
      "组织设计",
      "团队对齐",
      "系统演进"
    ],
    "related": [
      "conways-law",
      "team-topologies",
      "microservices-decomposition"
    ],
    "tags": [
      "inverse-conway",
      "team-design",
      "org-restructuring",
      "architecture"
    ],
    "origin_author": "Jonny LeRoy & Matt Simons, 2010",
    "origin_source": "Dealing with Creaky Legacy Platforms (Cutter IT Journal, December 2010)",
    "origin_source_zh": "《应对老旧的遗留平台》（Cutter IT Journal 期刊，2010年12月）",
    "complexity": "advanced",
    "when_to_use": [
      "Transitioning from a monolith to microservices and needing teams aligned to service boundaries",
      "Building a new platform layer and forming a dedicated platform team before writing platform code",
      "Merging two acquired companies and designing a unified architecture through deliberate team composition",
      "Preparing for a major scale-up where the current org structure will produce the wrong architecture"
    ],
    "when_to_use_zh": [
      "从单体向微服务转型，需要团队与服务边界对齐",
      "构建新的平台层，在编写平台代码之前组建专属平台团队",
      "合并两家被收购公司，通过有意的团队组合设计统一架构",
      "为重大规模扩展做准备，当前组织结构会产生错误的架构"
    ],
    "core_concepts": [
      "Prescriptive org design: Intentionally shape team structure to produce the desired software architecture",
      "Team-first architecture: Design teams before designing systems — the org chart is the first architecture diagram",
      "Bounded ownership: Each team owns a well-defined architectural component with clear API boundaries",
      "Architecture blueprint: The target architecture serves as the template for team structure, not the other way around",
      "Feedback loop: Continuously verify that the code structure emerging from team interactions matches the target architecture"
    ],
    "core_concepts_zh": [
      "规范性组织设计：有意塑造团队结构以产出期望的软件架构",
      "团队优先架构：先设计团队，再设计系统——组织结构图是第一张架构图",
      "有界所有权：每个团队拥有定义明确、具有清晰 API 边界的架构组件",
      "架构蓝图：目标架构作为团队结构的模板，而非反过来",
      "反馈循环：持续验证团队交互中涌现的代码结构是否与目标架构匹配"
    ],
    "timeline": [
      [
        "2010",
        "Jonny LeRoy and Matt Simons coin 'Inverse Conway Maneuver' at ThoughtWorks"
      ],
      [
        "2014",
        "ThoughtWorks Technology Radar features the maneuver as a recommended technique"
      ],
      [
        "2015",
        "Sam Newman discusses it extensively in the context of microservice team design"
      ],
      [
        "2019",
        "Team Topologies formalizes the maneuver with four team types and three interaction modes"
      ],
      [
        "2022",
        "Large enterprises (ING, Adidas) report successful large-scale application of the maneuver"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "Jonny LeRoy 和 Matt Simons 在 ThoughtWorks 提出「逆康威策略」"
      ],
      [
        "2014",
        "ThoughtWorks 技术雷达将该策略作为推荐技术加以收录"
      ],
      [
        "2015",
        "Sam Newman 在微服务团队设计的背景下详细讨论该策略"
      ],
      [
        "2019",
        "《团队拓扑》通过四种团队类型和三种交互模式将该策略形式化"
      ],
      [
        "2022",
        "大型企业（ING、Adidas）报告了该策略的大规模成功应用"
      ]
    ],
    "dos": [
      "Start with a clear target architecture diagram before touching the org chart",
      "Get executive sponsorship — org restructuring requires top-down support to succeed",
      "Pair the maneuver with Team Topologies to provide a concrete vocabulary for team types and interactions",
      "Monitor team health and delivery metrics for 2-3 quarters after restructuring to verify results"
    ],
    "dos_zh": [
      "在调整组织结构之前，先确定清晰的目标架构图",
      "获得高管支持——组织重组需要自上而下的支持才能成功",
      "将该策略与团队拓扑结合使用，为团队类型和交互提供具体词汇",
      "重组后连续 2-3 个季度监控团队健康度和交付指标以验证效果"
    ],
    "donts": [
      "Don't restructure teams without a clearly defined target architecture — you'll just create different chaos",
      "Don't ignore the human impact — frequent reorgs destroy morale if not handled with empathy and clarity",
      "Don't expect instant results — it takes 2-3 quarters for the new structure to produce architectural change",
      "Don't forget to also change tooling and processes — team structure alone won't fix misaligned CI/CD pipelines"
    ],
    "donts_zh": [
      "不要在没有明确目标架构的情况下重组团队——否则只会制造不同的混乱",
      "不要忽视人的因素——频繁重组如果不以同理心和透明度处理会摧毁士气",
      "不要期望立竿见影——新结构需要 2-3 个季度才能产生架构变化",
      "不要忘记同时变更工具和流程——仅靠团队结构无法修复不对齐的 CI/CD 流水线"
    ],
    "case_study_company": "ING Bank",
    "case_study": "ING Bank Netherlands applied the Inverse Conway Maneuver in 2015 when they restructured their IT organization from project-based teams into autonomous squads and tribes aligned with customer journeys. Each squad owned a specific microservice domain end-to-end. Within two years, their deployment frequency increased from quarterly releases to multiple deployments per day, and their architecture naturally evolved from a tangled monolith into well-bounded services.",
    "case_study_zh": "荷兰 ING 银行在 2015 年应用了逆康威策略，将 IT 组织从基于项目的团队重组为与客户旅程对齐的自治小队和部落。每个小队端到端地拥有特定的微服务领域。两年内，他们的部署频率从季度发布提升到每天多次部署，架构自然从纠缠的单体演进为边界清晰的服务。",
    "when_not_to_use": [
      "The organization is too small for team structure to meaningfully constrain architecture",
      "Leadership is unwilling to make organizational changes — the maneuver requires real restructuring",
      "The target architecture is not yet well understood — restructuring teams around a vague blueprint is wasteful",
      "The existing architecture is working well and there is no strategic need to change it"
    ],
    "when_not_to_use_zh": [
      "组织太小，团队结构不会对架构产生有意义的约束",
      "领导层不愿做出组织变革——该策略需要真正的重组",
      "目标架构尚未被充分理解——围绕模糊蓝图重组团队是浪费",
      "现有架构运行良好，没有战略性变更需求"
    ],
    "adopters": [
      "ING Bank",
      "Adidas",
      "Spotify",
      "Zalando",
      "Sky UK"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Jonny LeRoy and Matt Simons (2010). \"Dealing with Creaky Legacy Platforms\". Cutter IT Journal, 23(12).",
    "secondary_sources": [
      "Matthew Skelton and Manuel Pais (2019). \"Team Topologies: Organizing Business and Technology Teams for Fast Flow\". IT Revolution Press.",
      "James Lewis and Martin Fowler (2014). \"Microservices: A Definition of This New Architectural Term\". martinfowler.com."
    ],
    "typed_relations": [
      {
        "slug": "conways-law",
        "type": "extends"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      },
      {
        "slug": "microservices-decomposition",
        "type": "complement"
      }
    ]
  },
  {
    "id": 80,
    "name": "Team Topologies",
    "name_zh": "团队拓扑",
    "slug": "team-topologies",
    "category": "evolution",
    "desc": "Four team types and three interaction modes for fast, sustainable software delivery",
    "desc_zh": "通过四种团队类型与三种交互模式实现快速可持续的软件交付",
    "steps": [
      "Classify all existing teams into the four types: Stream-aligned, Platform, Enabling, Complicated-subsystem",
      "Identify current interaction modes: Collaboration, X-as-a-Service, or Facilitating",
      "Detect anti-patterns such as overloaded stream-aligned teams or absent platform teams",
      "Redesign team interactions to minimize cognitive load and maximize flow for stream-aligned teams",
      "Set trigger conditions (team size, cognitive load signals) for evolving team topology over time"
    ],
    "steps_zh": [
      "将现有团队分类为四种类型：流式对齐团队、平台团队、赋能团队、复杂子系统团队",
      "识别当前交互模式：协作、X即服务或促进",
      "检测反模式，如过载的流式对齐团队或缺失的平台团队",
      "重新设计团队交互方式，最小化认知负荷，最大化流式对齐团队的交付流",
      "设定触发条件（团队规模、认知负荷信号），驱动团队拓扑随时间持续演进"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Stream-Aligned",
      "Platform",
      "Enabling",
      "Subsystem"
    ],
    "viz_labels_zh": [
      "流式对齐团队",
      "平台团队",
      "赋能团队",
      "复杂子系统"
    ],
    "related": [
      "conways-law",
      "inverse-conway-maneuver",
      "calms-framework",
      "spotify-model",
      "amazon-two-pizza-teams"
    ],
    "tags": [
      "team-topologies",
      "cognitive-load",
      "org-design",
      "stream-aligned"
    ],
    "origin_author": "Matthew Skelton & Manuel Pais, 2019",
    "origin_source": "Team Topologies: Organizing Business and Technology Teams for Fast Flow (IT Revolution Press)",
    "origin_source_zh": "《团队拓扑：以业务与技术团队组织实现快速流动》（IT Revolution 出版）",
    "complexity": "intermediate",
    "when_to_use": [
      "Scaling engineering from 3-5 teams to 10+ teams and needing a coherent organizational model",
      "Reducing cross-team dependencies that are slowing down delivery across multiple product streams",
      "Designing a platform engineering organization to support self-service developer experience",
      "Diagnosing cognitive overload in teams that own too many domains or too much infrastructure"
    ],
    "when_to_use_zh": [
      "工程团队从 3-5 个扩展到 10+ 个，需要连贯的组织模型",
      "减少跨团队依赖，这些依赖正在拖慢多个产品流的交付",
      "设计平台工程组织以支撑自助式开发者体验",
      "诊断因拥有过多领域或过多基础设施而导致的团队认知过载"
    ],
    "core_concepts": [
      "Stream-aligned team: The primary team type, organized around a flow of work from a business domain or user journey",
      "Platform team: Provides self-service internal capabilities that reduce cognitive load for stream-aligned teams",
      "Enabling team: Helps stream-aligned teams adopt new technologies or practices, then steps back",
      "Complicated-subsystem team: Owns components requiring deep specialist knowledge that would overload a stream-aligned team",
      "Cognitive load: The core constraint — team topology must be designed to keep each team's cognitive load manageable"
    ],
    "core_concepts_zh": [
      "流式对齐团队：主要团队类型，围绕业务领域或用户旅程的工作流组织",
      "平台团队：提供自助式内部能力，减轻流式对齐团队的认知负荷",
      "赋能团队：帮助流式对齐团队采用新技术或实践，然后退出",
      "复杂子系统团队：负责需要深度专业知识的组件，这类组件若交给流式对齐团队会使其认知过载",
      "认知负荷：核心约束——团队拓扑必须设计为使每个团队的认知负荷保持在可控范围"
    ],
    "timeline": [
      [
        "2013",
        "Skelton and Pais begin writing about DevOps team patterns on their blog"
      ],
      [
        "2016",
        "The term 'Team Topologies' appears in conference talks and early publications"
      ],
      [
        "2019",
        "The book 'Team Topologies' is published by IT Revolution Press and becomes an instant bestseller"
      ],
      [
        "2021",
        "Team Topologies Academy launches, offering certified training and workshops"
      ],
      [
        "2023",
        "The framework is widely adopted by enterprises worldwide, with translations into 10+ languages"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "Skelton 和 Pais 开始在博客上撰写关于 DevOps 团队模式的文章"
      ],
      [
        "2016",
        "「团队拓扑」一词出现在会议演讲和早期出版物中"
      ],
      [
        "2019",
        "《团队拓扑》由 IT Revolution 出版，迅速成为畅销书"
      ],
      [
        "2021",
        "团队拓扑学院成立，提供认证培训和工作坊"
      ],
      [
        "2023",
        "该框架被全球企业广泛采用，已翻译成 10 多种语言"
      ]
    ],
    "dos": [
      "Start by assessing each team's cognitive load — this is the fundamental constraint to optimize for",
      "Ensure most teams are stream-aligned — platform and enabling teams exist to support them, not the reverse",
      "Define explicit interaction modes between every pair of teams and review them quarterly",
      "Use team topology evolution triggers (growing team size, rising lead time) to know when to restructure"
    ],
    "dos_zh": [
      "从评估每个团队的认知负荷开始——这是需要优化的根本约束",
      "确保大多数团队是流式对齐的——平台和赋能团队的存在是为了支持它们，而非反过来",
      "为每对团队之间定义明确的交互模式，并每季度审查",
      "使用团队拓扑演进触发条件（团队规模增长、前置时间上升）来判断何时重组"
    ],
    "donts": [
      "Don't create platform teams too early — wait until the need is clear from cognitive load signals",
      "Don't let enabling teams become permanent dependencies — they should upskill and move on",
      "Don't force-fit every team into one of four types — some teams may be transitioning between types",
      "Don't apply the framework without executive buy-in — team topology changes require organizational authority"
    ],
    "donts_zh": [
      "不要过早创建平台团队——等到认知负荷信号明确显示需要时再行动",
      "不要让赋能团队成为永久依赖——他们应该赋能后撤出",
      "不要强行将每个团队归入四种类型之一——有些团队可能正在类型之间过渡",
      "不要在没有高管支持的情况下应用该框架——团队拓扑变更需要组织层面的授权"
    ],
    "case_study_company": "Adidas",
    "case_study": "Adidas adopted Team Topologies to restructure their engineering organization during their digital transformation in 2020. They reorganized over 30 teams into stream-aligned squads mapped to customer journeys (browse, purchase, returns), supported by a platform team providing a shared Kubernetes-based developer platform. The restructuring reduced their average lead time for changes from weeks to days and eliminated most cross-team handoffs that had been blocking releases.",
    "case_study_zh": "Adidas 在 2020 年数字化转型期间采用团队拓扑重组其工程组织。他们将 30 多个团队重组为映射到客户旅程（浏览、购买、退货）的流式对齐小队，并由一个提供共享 Kubernetes 开发者平台的平台团队提供支撑。重组将变更的平均前置时间从数周缩短到数天，并消除了大部分阻碍发布的跨团队交接。",
    "when_not_to_use": [
      "Organizations with fewer than 3 teams — the overhead of formal topology classification is not justified",
      "Teams that are fully cross-functional and self-sufficient with no cognitive load issues",
      "Short-term project-based work where teams disband after delivery",
      "The organizational culture resists any form of team classification or prescribed interaction modes"
    ],
    "when_not_to_use_zh": [
      "少于 3 个团队的组织——正式拓扑分类的开销不合理",
      "团队已完全跨职能且自给自足，没有认知负荷问题",
      "短期项目制工作，交付后团队即解散",
      "组织文化抵制任何形式的团队分类或规定的交互模式"
    ],
    "adopters": [
      "Adidas",
      "PureGym",
      "uSwitch",
      "Nationwide Building Society",
      "Condé Nast"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Matthew Skelton and Manuel Pais (2019). \"Team Topologies: Organizing Business and Technology Teams for Fast Flow\". IT Revolution Press.",
    "secondary_sources": [
      "Melvin Conway (1968). \"How Do Committees Invent?\". Datamation, 14(4).",
      "Nicole Forsgren, Jez Humble, and Gene Kim (2018). \"Accelerate: The Science of Lean Software and DevOps\". IT Revolution Press."
    ],
    "typed_relations": [
      {
        "slug": "conways-law",
        "type": "complement"
      },
      {
        "slug": "inverse-conway-maneuver",
        "type": "complement"
      },
      {
        "slug": "calms-framework",
        "type": "complement"
      },
      {
        "slug": "spotify-model",
        "type": "alternative"
      },
      {
        "slug": "amazon-two-pizza-teams",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 81,
    "name": "Database Migration Patterns",
    "name_zh": "数据库迁移模式",
    "slug": "database-migration-patterns",
    "category": "evolution",
    "desc": "Safe techniques for evolving database schemas without downtime or data loss",
    "desc_zh": "无停机、无数据丢失地演进数据库模式的安全技术集合",
    "steps": [
      "Version-control all schema changes as incremental migration scripts (Flyway, Liquibase)",
      "Apply expand-contract: add new columns/tables first, migrate data, then remove old structures",
      "Use backward-compatible migrations so old and new application code can run simultaneously",
      "Test migrations in a production-equivalent staging environment before applying to production",
      "Automate rollback scripts for every migration and validate them in CI before merging"
    ],
    "steps_zh": [
      "将所有模式变更以增量迁移脚本形式纳入版本控制（Flyway、Liquibase）",
      "应用扩展-收缩模式：先新增列/表，迁移数据，再删除旧结构",
      "保持向后兼容的迁移，使新旧应用代码可同时运行",
      "在等同于生产环境的预发布环境中测试迁移，再应用至生产",
      "为每次迁移自动化回滚脚本，并在合并前通过 CI 验证"
    ],
    "ai_relevant": false,
    "viz_type": "timeline",
    "viz_labels": [
      "Schema v1",
      "Expand",
      "Migrate Data",
      "Contract"
    ],
    "viz_labels_zh": [
      "初始模式",
      "扩展字段",
      "迁移数据",
      "收缩旧字段"
    ],
    "related": [
      "strangler-fig-pattern",
      "branch-by-abstraction",
      "api-versioning-strategies"
    ],
    "tags": [
      "database",
      "migration",
      "schema-evolution",
      "zero-downtime"
    ],
    "origin_author": "Pramod Sadalage & Scott Ambler, 2006",
    "origin_source": "Refactoring Databases: Evolutionary Database Design (Addison-Wesley)",
    "origin_source_zh": "《数据库重构：演进式数据库设计》（Addison-Wesley 出版）",
    "complexity": "intermediate",
    "when_to_use": [
      "Deploying schema changes in a zero-downtime production environment with rolling deployments",
      "Splitting a shared database as part of a monolith-to-microservices decomposition",
      "Migrating from one database vendor to another while keeping the application live",
      "Renaming columns, changing data types, or restructuring tables in a system with multiple consuming applications"
    ],
    "when_to_use_zh": [
      "在零停机的生产环境中通过滚动部署发布模式变更",
      "作为单体到微服务拆分的一部分拆分共享数据库",
      "在保持应用在线的同时从一个数据库供应商迁移到另一个",
      "在有多个消费应用的系统中重命名列、更改数据类型或重构表"
    ],
    "core_concepts": [
      "Expand-Contract (Parallel Change): Add new structures, migrate data, then remove old structures in separate deployments",
      "Version-controlled migrations: Every schema change is a numbered, sequential, idempotent script stored in source control",
      "Backward compatibility: Both old and new application code must work against the database during the migration window",
      "Blue-green database: Maintaining two schema versions simultaneously so deployments can be rolled back safely",
      "Data migration vs. schema migration: Separating structural changes from data transformation for clarity and safety"
    ],
    "core_concepts_zh": [
      "扩展-收缩（并行变更）：先新增结构，迁移数据，再在独立部署中删除旧结构",
      "版本控制迁移：每个模式变更都是存储在源代码控制中的编号、顺序、幂等脚本",
      "向后兼容：在迁移窗口期间，新旧应用代码都必须能够操作数据库",
      "蓝绿数据库：同时维护两个模式版本，以便可以安全回滚部署",
      "数据迁移与模式迁移：将结构变更与数据转换分离，以提高清晰度和安全性"
    ],
    "timeline": [
      [
        "2003",
        "Scott Ambler publishes early work on agile database techniques and evolutionary design"
      ],
      [
        "2006",
        "Sadalage and Ambler publish 'Refactoring Databases', codifying expand-contract and migration patterns"
      ],
      [
        "2010",
        "Flyway is released as an open-source database migration tool, popularizing version-controlled migrations"
      ],
      [
        "2014",
        "Zero-downtime migration becomes a standard requirement for cloud-native continuous deployment"
      ],
      [
        "2020",
        "Tools like gh-ost (GitHub) and pt-online-schema-change (Percona) handle large-scale online migrations at internet scale"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Scott Ambler 发表关于敏捷数据库技术和演进式设计的早期作品"
      ],
      [
        "2006",
        "Sadalage 和 Ambler 出版《数据库重构》，将扩展-收缩和迁移模式编纂成书"
      ],
      [
        "2010",
        "Flyway 作为开源数据库迁移工具发布，推广了版本控制迁移"
      ],
      [
        "2014",
        "零停机迁移成为云原生持续部署的标准要求"
      ],
      [
        "2020",
        "gh-ost（GitHub）和 pt-online-schema-change（Percona）等工具在互联网规模下处理大规模在线迁移"
      ]
    ],
    "dos": [
      "Always use the expand-contract pattern for destructive changes (column drops, renames, type changes)",
      "Test every migration on a production-sized dataset — small test databases hide performance problems",
      "Include rollback scripts for every forward migration and test them automatically in CI",
      "Separate data migrations from schema migrations to keep each change small and reviewable"
    ],
    "dos_zh": [
      "对破坏性变更（删列、重命名、类型更改）始终使用扩展-收缩模式",
      "在生产规模的数据集上测试每次迁移——小型测试数据库会隐藏性能问题",
      "为每个正向迁移编写回滚脚本并在 CI 中自动测试",
      "将数据迁移与模式迁移分离，使每个变更小巧且易于审查"
    ],
    "donts": [
      "Don't apply destructive schema changes (DROP COLUMN, ALTER TYPE) in a single deployment step",
      "Don't skip testing migrations against production-sized data — 'works on my laptop' is not enough",
      "Don't mix application logic changes with schema migrations in the same deployment",
      "Don't assume your ORM handles migrations safely — always review the generated SQL"
    ],
    "donts_zh": [
      "不要在单次部署步骤中应用破坏性模式变更（DROP COLUMN、ALTER TYPE）",
      "不要跳过在生产规模数据上测试迁移——「在我笔记本上能跑」远远不够",
      "不要在同一次部署中混合应用逻辑变更和模式迁移",
      "不要假设 ORM 能安全处理迁移——始终审查生成的 SQL"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub developed gh-ost (GitHub Online Schema Transmogrifier) to handle zero-downtime schema migrations on their massive MySQL databases. Traditional ALTER TABLE operations would lock tables for hours, causing outages. gh-ost creates a shadow table, streams changes via the binary log, and performs an atomic cutover. It has been used to migrate tables with billions of rows across GitHub's production infrastructure without any user-facing downtime.",
    "case_study_zh": "GitHub 开发了 gh-ost（GitHub 在线模式变形器）来处理其大型 MySQL 数据库的零停机模式迁移。传统的 ALTER TABLE 操作会锁表数小时导致服务中断。gh-ost 创建影子表，通过二进制日志流式传输变更，并执行原子切换。它已被用于在 GitHub 生产基础设施中迁移拥有数十亿行的表，且无任何用户可见的停机。",
    "when_not_to_use": [
      "Development or test environments where downtime during migration is perfectly acceptable",
      "Greenfield projects with no existing data — just create the schema directly",
      "Embedded databases or single-user applications where concurrent access is not a concern",
      "Schema changes are so trivial (e.g., adding a nullable column) that no special pattern is needed"
    ],
    "when_not_to_use_zh": [
      "开发或测试环境中迁移期间停机完全可以接受",
      "没有现有数据的全新项目——直接创建模式即可",
      "嵌入式数据库或单用户应用，不存在并发访问问题",
      "模式变更极其简单（如添加可空列），不需要特殊模式"
    ],
    "adopters": [
      "GitHub",
      "Shopify",
      "Stripe",
      "Airbnb",
      "SoundCloud"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Pramod Sadalage and Scott Ambler (2006). \"Refactoring Databases: Evolutionary Database Design\". Addison-Wesley.",
    "secondary_sources": [
      "Martin Fowler and Pramod Sadalage (2003). \"Evolutionary Database Design\". martinfowler.com.",
      "Scott Ambler and Pramod Sadalage (2006). \"Agile Database Techniques\". Wiley."
    ],
    "typed_relations": [
      {
        "slug": "strangler-fig-pattern",
        "type": "complement"
      },
      {
        "slug": "branch-by-abstraction",
        "type": "complement"
      },
      {
        "slug": "api-versioning-strategies",
        "type": "complement"
      }
    ]
  },
  {
    "id": 83,
    "name": "AI-Assisted Refactoring",
    "name_zh": "AI 辅助重构",
    "slug": "ai-assisted-refactoring",
    "category": "evolution",
    "desc": "Use LLMs to identify, plan, and execute code refactoring at scale",
    "desc_zh": "利用大语言模型规模化识别、规划和执行代码重构",
    "steps": [
      "Feed the target codebase to an LLM and prompt it to identify code smells, duplication, and anti-patterns",
      "Generate a prioritized refactoring backlog with risk assessments and estimated effort from LLM output",
      "Use LLM-generated diffs or code completions for each refactoring task, reviewed by engineers",
      "Run the full test suite and fitness functions after each AI-generated change to catch regressions",
      "Track refactoring velocity and code quality metrics over time to measure AI-assistance ROI"
    ],
    "steps_zh": [
      "将目标代码库输入 LLM，提示其识别代码异味、重复和反模式",
      "根据 LLM 输出，生成带有风险评估和工作量估算的优先级重构待办清单",
      "使用 LLM 生成的差异或代码补全完成每项重构任务，并由工程师审查",
      "每次 AI 生成变更后运行完整测试套件和适应度函数，捕捉回归问题",
      "持续追踪重构速度和代码质量指标，量化 AI 辅助的投入产出比"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Analyze",
      "Suggest",
      "Apply",
      "Verify"
    ],
    "viz_labels_zh": [
      "分析代码",
      "AI建议",
      "应用变更",
      "验证"
    ],
    "related": [
      "mikado-method",
      "technical-debt-quadrant",
      "ai-pair-programming"
    ],
    "tags": [
      "ai-refactoring",
      "code-quality",
      "llm",
      "automation"
    ],
    "origin_author": "Industry practice, 2022",
    "origin_source": "Emerged from GitHub Copilot, ChatGPT, and large-scale LLM adoption in software engineering",
    "origin_source_zh": "源自 GitHub Copilot、ChatGPT 及大语言模型在软件工程中的大规模应用",
    "complexity": "intermediate",
    "when_to_use": [
      "A large legacy codebase with thousands of similar code smells that are tedious to fix manually",
      "Migrating framework versions (e.g., React class components to hooks) across hundreds of files",
      "Standardizing coding patterns across a monorepo with contributions from multiple teams",
      "Identifying and eliminating dead code, unused imports, or deprecated API calls at scale"
    ],
    "when_to_use_zh": [
      "大型遗留代码库中有数千个相似的代码异味，手动修复过于繁琐",
      "跨数百个文件迁移框架版本（如 React 类组件迁移至 Hooks）",
      "在多团队贡献的单仓库中标准化编码模式",
      "规模化识别和消除死代码、未使用导入或已弃用 API 调用"
    ],
    "core_concepts": [
      "LLM-driven code analysis: Using large language models to scan codebases for patterns, smells, and improvement opportunities",
      "AI-generated diffs: LLMs produce specific code changes (diffs) that engineers review before merging",
      "Human-in-the-loop: Every AI-generated change must be reviewed by a human engineer for correctness and intent",
      "Test-guarded refactoring: The existing test suite and fitness functions serve as a safety net for AI-generated changes",
      "Batch refactoring: Applying the same LLM-driven transformation across many files simultaneously for economies of scale"
    ],
    "core_concepts_zh": [
      "LLM 驱动的代码分析：使用大语言模型扫描代码库，识别模式、异味和改进机会",
      "AI 生成的差异：LLM 生成具体的代码变更（diff），由工程师审查后合并",
      "人在回路中：每个 AI 生成的变更必须由人类工程师审查其正确性和意图",
      "测试保障重构：现有测试套件和适应度函数作为 AI 生成变更的安全网",
      "批量重构：将相同的 LLM 驱动转换同时应用于多个文件，实现规模效益"
    ],
    "timeline": [
      [
        "2021",
        "GitHub Copilot technical preview launches, enabling AI-assisted code completion"
      ],
      [
        "2022",
        "ChatGPT release sparks widespread exploration of LLMs for code refactoring tasks"
      ],
      [
        "2023",
        "Tools like Sourcegraph Cody and Amazon CodeWhisperer add refactoring-specific capabilities"
      ],
      [
        "2024",
        "Large enterprises report using AI agents for automated codemod generation and tech debt reduction"
      ],
      [
        "2025",
        "Claude Code, Cursor, and similar tools enable agentic multi-file refactoring workflows"
      ]
    ],
    "timeline_zh": [
      [
        "2021",
        "GitHub Copilot 技术预览版发布，实现 AI 辅助代码补全"
      ],
      [
        "2022",
        "ChatGPT 的发布引发了将 LLM 用于代码重构任务的广泛探索"
      ],
      [
        "2023",
        "Sourcegraph Cody 和 Amazon CodeWhisperer 等工具增加了重构专属功能"
      ],
      [
        "2024",
        "大型企业报告使用 AI 智能体进行自动化 codemod 生成和技术债务削减"
      ],
      [
        "2025",
        "Claude Code、Cursor 等工具实现了智能体式多文件重构工作流"
      ]
    ],
    "dos": [
      "Always run the full test suite after every AI-generated change — never merge untested AI output",
      "Start with mechanical, pattern-based refactorings (renames, import updates) where LLMs are most reliable",
      "Use AI to generate the initial diff, then have engineers refine it — not the other way around",
      "Track before/after code quality metrics to quantify the ROI of AI-assisted refactoring"
    ],
    "dos_zh": [
      "每次 AI 生成变更后始终运行完整测试套件——绝不合并未经测试的 AI 输出",
      "从机械的、基于模式的重构（重命名、导入更新）开始，这是 LLM 最可靠的领域",
      "用 AI 生成初始差异，再由工程师精修——而非反过来",
      "追踪重构前后的代码质量指标，量化 AI 辅助重构的投入产出比"
    ],
    "donts": [
      "Don't blindly merge AI-generated code without human review — LLMs can introduce subtle bugs",
      "Don't use AI refactoring on code without test coverage — there's no safety net to catch regressions",
      "Don't attempt complex architectural refactoring (e.g., extracting microservices) via AI alone",
      "Don't ignore the context window limitation — LLMs may miss dependencies outside their visible scope"
    ],
    "donts_zh": [
      "不要在没有人工审查的情况下盲目合并 AI 生成的代码——LLM 可能引入微妙的 bug",
      "不要在没有测试覆盖的代码上使用 AI 重构——没有安全网来捕获回归",
      "不要试图仅通过 AI 完成复杂的架构重构（如提取微服务）",
      "不要忽视上下文窗口限制——LLM 可能遗漏其可见范围之外的依赖"
    ],
    "case_study_company": "Google",
    "case_study": "Google has used large-scale automated refactoring tooling (Rosie, part of their internal code analysis system) for over a decade. With the advent of LLMs, they integrated AI-assisted code transformation into their internal toolchain to handle migrations like API deprecations and style standardizations across their multi-billion-line monorepo. AI-generated changelists are reviewed by engineers and must pass all automated tests before submission, enabling them to complete refactorings that would previously take months in a fraction of the time.",
    "case_study_zh": "Google 十多年来一直使用大规模自动化重构工具（Rosie，其内部代码分析系统的一部分）。随着 LLM 的出现，他们将 AI 辅助代码转换集成到内部工具链中，用于处理数十亿行单仓库中的 API 弃用和风格标准化等迁移任务。AI 生成的变更列表由工程师审查，且必须通过所有自动化测试后才能提交，使他们能够在以前需要数月的时间内完成重构。",
    "when_not_to_use": [
      "Security-critical code where AI-introduced subtle bugs could have catastrophic consequences",
      "Codebases with zero or near-zero test coverage — there is no safety net for AI changes",
      "One-off refactorings that are faster to do manually than to set up an AI pipeline",
      "Performance-critical hot paths where AI-generated code may not meet stringent optimization requirements"
    ],
    "when_not_to_use_zh": [
      "安全关键代码，AI 引入的微妙 bug 可能造成灾难性后果",
      "测试覆盖率为零或接近于零的代码库——AI 变更没有安全网",
      "一次性的重构，手动完成比搭建 AI 管线更快",
      "性能关键的热路径，AI 生成的代码可能无法满足严格的优化要求"
    ],
    "adopters": [
      "Google",
      "Meta",
      "Shopify",
      "Sourcegraph",
      "Uber"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "GitHub (2021). \"GitHub Copilot: Your AI Pair Programmer\". github.com/features/copilot.",
    "secondary_sources": [
      "Mark Chen et al. (2021). \"Evaluating Large Language Models Trained on Code (Codex)\". arXiv:2107.03374.",
      "Martin Fowler (2018). \"Refactoring: Improving the Design of Existing Code, 2nd Edition\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "mikado-method",
        "type": "complement"
      },
      {
        "slug": "technical-debt-quadrant",
        "type": "complement"
      },
      {
        "slug": "ai-pair-programming",
        "type": "complement"
      }
    ]
  },
  {
    "id": 84,
    "name": "Continuous Architecture",
    "name_zh": "持续架构",
    "slug": "continuous-architecture",
    "category": "evolution",
    "desc": "Evolve architecture incrementally in sync with delivery rather than big up-front design",
    "desc_zh": "与交付节奏同步地增量演进架构，取代前期大设计",
    "steps": [
      "Establish lightweight Architecture Decision Records (ADRs) to capture decisions as they are made",
      "Embed architectural reviews as a routine part of sprint planning and retrospectives",
      "Define architectural runways: maintain just-enough design ahead of the development queue",
      "Use fitness functions to monitor whether the live system still matches architectural intent",
      "Revisit and update ADRs when new information, scale demands, or AI capabilities shift priorities"
    ],
    "steps_zh": [
      "建立轻量级架构决策记录（ADR），在决策作出时即时捕获",
      "将架构评审作为迭代计划和回顾会议的常规环节",
      "定义架构跑道：在开发队列前方维持恰到好处的设计储备",
      "使用适应度函数监控线上系统是否仍与架构意图保持一致",
      "当新信息、规模需求或 AI 能力改变优先级时，及时重审和更新 ADR"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Decide",
      "Implement",
      "Evaluate",
      "Adapt"
    ],
    "viz_labels_zh": [
      "架构决策",
      "实施",
      "评估",
      "适应演进"
    ],
    "related": [
      "adr",
      "architectural-fitness-functions",
      "three-ways-devops"
    ],
    "tags": [
      "continuous-architecture",
      "evolutionary",
      "agile",
      "incremental"
    ],
    "origin_author": "Murat Erder & Pierre Pureur, 2015",
    "origin_source": "Continuous Architecture: Sustainable Architecture in an Agile and Cloud-Centric World (Morgan Kaufmann)",
    "origin_source_zh": "《持续架构：敏捷和云中心化世界中的可持续架构》（Morgan Kaufmann 出版）",
    "complexity": "intermediate",
    "when_to_use": [
      "An agile organization that needs architectural guidance without reverting to waterfall-style big design up front",
      "A rapidly growing system where requirements change faster than a fixed architecture can accommodate",
      "Teams adopting cloud-native infrastructure and needing architecture that evolves with cloud capabilities",
      "Organizations integrating AI/ML components where architectural assumptions shift as models and capabilities improve"
    ],
    "when_to_use_zh": [
      "敏捷组织需要架构指导，但不想回归瀑布式大前期设计",
      "快速增长的系统，需求变化速度超出固定架构的适应能力",
      "团队采用云原生基础设施，需要随云能力演进的架构",
      "整合 AI/ML 组件的组织，架构假设随模型和能力提升而变化"
    ],
    "core_concepts": [
      "Architecture Decision Records (ADRs): Lightweight, version-controlled documents that capture the context, decision, and consequences of each architectural choice",
      "Architectural runway: Just enough planned architecture to support the next few sprints, not a complete blueprint",
      "Last responsible moment: Defer architectural decisions until the cost of not deciding outweighs the benefit of waiting for more information",
      "Fitness functions: Automated checks that continuously verify the live system conforms to architectural intent",
      "Architecture as code: Treating architectural constraints as executable specifications embedded in CI/CD pipelines"
    ],
    "core_concepts_zh": [
      "架构决策记录（ADR）：轻量级、版本控制的文档，记录每个架构选择的上下文、决策和后果",
      "架构跑道：仅规划足够支撑未来几个迭代的架构，而非完整蓝图",
      "最后责任时刻：将架构决策推迟到不做决策的成本超过等待更多信息的收益时",
      "适应度函数：持续验证线上系统符合架构意图的自动化检查",
      "架构即代码：将架构约束视为嵌入 CI/CD 流水线的可执行规格说明"
    ],
    "timeline": [
      [
        "2010",
        "SAFe introduces the concept of Architectural Runway for agile-at-scale"
      ],
      [
        "2011",
        "Michael Nygard proposes lightweight Architecture Decision Records (ADRs)"
      ],
      [
        "2015",
        "Erder and Pureur publish 'Continuous Architecture', codifying the approach"
      ],
      [
        "2017",
        "'Building Evolutionary Architectures' by Ford, Parsons & Kua adds fitness functions to continuous architecture"
      ],
      [
        "2021",
        "Erder, Pureur & Woods publish 'Continuous Architecture in Practice', updating the approach for cloud-native and AI"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "SAFe 引入架构跑道概念，用于规模化敏捷"
      ],
      [
        "2011",
        "Michael Nygard 提出轻量级架构决策记录（ADR）"
      ],
      [
        "2015",
        "Erder 和 Pureur 出版《持续架构》，将该方法编纂成书"
      ],
      [
        "2017",
        "Ford、Parsons 和 Kua 的《演进式架构》为持续架构增加了适应度函数"
      ],
      [
        "2021",
        "Erder、Pureur 和 Woods 出版《持续架构实践》，针对云原生和 AI 更新方法"
      ]
    ],
    "dos": [
      "Write ADRs for every significant architectural decision — even if the decision is 'we chose not to change'",
      "Keep the architectural runway 1-2 sprints ahead, not 6 months ahead",
      "Embed architects in delivery teams rather than isolating them in an ivory tower review board",
      "Use fitness functions to make architectural constraints executable and automatically enforced"
    ],
    "dos_zh": [
      "为每个重要架构决策编写 ADR——即使决策是「我们选择不变更」",
      "将架构跑道保持在 1-2 个迭代之前，而非 6 个月之前",
      "让架构师嵌入交付团队，而非将他们隔离在象牙塔评审委员会中",
      "使用适应度函数使架构约束可执行并自动强制执行"
    ],
    "donts": [
      "Don't fall back to big up-front design disguised as 'architectural planning' — keep it lightweight",
      "Don't skip ADRs for 'obvious' decisions — they provide invaluable context for future team members",
      "Don't treat architecture as a one-time activity — it must continuously evolve with the system",
      "Don't let the architectural runway grow too far ahead of delivery — unused design becomes waste"
    ],
    "donts_zh": [
      "不要回退到伪装成「架构规划」的大前期设计——保持轻量",
      "不要跳过「显而易见」决策的 ADR——它们为未来团队成员提供宝贵上下文",
      "不要将架构视为一次性活动——它必须与系统持续共同演进",
      "不要让架构跑道远远领先于交付——未被使用的设计就是浪费"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify adopted continuous architecture principles as part of their famous squad/tribe/chapter model. Each squad makes its own architectural decisions, documented as lightweight ADRs in their repositories. A chapter of architects across squads ensures alignment on cross-cutting concerns. This approach allowed Spotify to evolve from a monolith to hundreds of microservices without ever pausing delivery for a 'big redesign', enabling them to scale from 10 million to over 500 million users.",
    "case_study_zh": "Spotify 将持续架构原则作为其著名的小队/部落/分会模型的一部分加以采用。每个小队自主做出架构决策，以轻量级 ADR 形式记录在各自仓库中。跨小队的架构师分会确保在横切关注点上保持一致。这种方法使 Spotify 从单体演进到数百个微服务，而无需暂停交付进行「大重设计」，支撑其从 1000 万用户扩展到超过 5 亿用户。",
    "when_not_to_use": [
      "Safety-critical systems (medical devices, avionics) where regulatory compliance requires extensive up-front design documentation",
      "Fixed-scope contract projects where the architecture is contractually specified before development begins",
      "Very short-lived projects (under 3 months) where the overhead of ADRs and fitness functions exceeds their benefit",
      "Teams with no agile delivery practices — continuous architecture assumes iterative, incremental delivery"
    ],
    "when_not_to_use_zh": [
      "安全关键系统（医疗设备、航空电子），法规合规要求大量前期设计文档",
      "固定范围的合同项目，架构在开发开始前已在合同中明确规定",
      "非常短期的项目（3 个月以下），ADR 和适应度函数的开销超过收益",
      "没有敏捷交付实践的团队——持续架构假设迭代、增量式交付"
    ],
    "adopters": [
      "Spotify",
      "Capital One",
      "Netflix",
      "ING Bank",
      "ThoughtWorks"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Murat Erder and Pierre Pureur (2015). \"Continuous Architecture: Sustainable Architecture in an Agile and Cloud-Centric World\". Morgan Kaufmann.",
    "secondary_sources": [
      "Murat Erder, Pierre Pureur, and Eoin Woods (2021). \"Continuous Architecture in Practice\". Addison-Wesley.",
      "Neal Ford, Rebecca Parsons, and Patrick Kua (2017). \"Building Evolutionary Architectures\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "adr",
        "type": "complement"
      },
      {
        "slug": "architectural-fitness-functions",
        "type": "complement"
      },
      {
        "slug": "three-ways-devops",
        "type": "complement"
      }
    ]
  },
  {
    "id": 85,
    "name": "Evolutionary Agent Systems",
    "name_zh": "演进式智能体系统",
    "slug": "evolutionary-agent-systems",
    "category": "evolution",
    "desc": "Iteratively evolve AI agent architectures using feedback, eval, and capability staging",
    "desc_zh": "通过反馈、评估和能力分级持续迭代演进 AI 智能体架构",
    "steps": [
      "Define agent capability tiers and establish eval suites that benchmark each capability level",
      "Deploy agents in a shadow or canary mode, collecting traces, errors, and human feedback",
      "Use evals and failure analysis to identify the next highest-value capability to add or fix",
      "Introduce new tools, memory stores, or orchestration patterns behind feature flags",
      "Promote changes incrementally using A/B testing on agent outputs, retiring deprecated behaviors"
    ],
    "steps_zh": [
      "定义智能体能力层级，并建立对每个能力层级进行基准测试的评估套件",
      "以影子模式或金丝雀模式部署智能体，收集调用链、错误和人工反馈",
      "使用评估结果和失败分析，确定下一个最高价值的待新增或待修复能力",
      "在功能开关后引入新工具、记忆存储或编排模式",
      "通过对智能体输出进行 A/B 测试，增量晋升变更，退役已弃用的行为"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Deploy",
      "Observe",
      "Retrain",
      "Evolve"
    ],
    "viz_labels_zh": [
      "部署",
      "观测",
      "重训练",
      "演化"
    ],
    "related": [
      "agent-deployment-patterns",
      "agent-reliability-patterns",
      "continuous-architecture"
    ],
    "tags": [
      "agent-evolution",
      "capability-staging",
      "evaluation",
      "ai"
    ],
    "origin_author": "Industry practice, 2023",
    "origin_source": "Emerged from production AI agent deployments at Anthropic, OpenAI, Google DeepMind, and applied AI companies",
    "origin_source_zh": "源自 Anthropic、OpenAI、Google DeepMind 及应用 AI 公司的生产 AI 智能体部署实践",
    "complexity": "advanced",
    "when_to_use": [
      "Deploying an AI coding assistant that needs to progressively gain access to more tools and repositories",
      "Building a customer support agent that evolves from FAQ handling to complex issue resolution",
      "Operating multi-agent systems where new agent roles must be introduced and validated incrementally",
      "Evolving an AI data analyst from simple queries to complex multi-step reasoning workflows"
    ],
    "when_to_use_zh": [
      "部署需要逐步获取更多工具和仓库访问权限的 AI 编码助手",
      "构建从 FAQ 处理演进到复杂问题解决的客户支持智能体",
      "运行多智能体系统，需要增量引入和验证新的智能体角色",
      "将 AI 数据分析师从简单查询演进到复杂多步推理工作流"
    ],
    "core_concepts": [
      "Capability tiers: A staged ladder of agent abilities, where each tier is unlocked after demonstrating reliability at the previous level",
      "Eval-driven development: Agent evolution is guided by quantitative eval suites that benchmark quality, safety, and reliability",
      "Shadow/canary deployment: New agent capabilities run alongside proven ones, with outputs compared but not exposed to users",
      "Human feedback loop: User corrections, thumbs-up/down, and escalation patterns drive the prioritization of capability improvements",
      "Graceful degradation: Agents must fall back to simpler, proven behaviors when newly introduced capabilities encounter edge cases"
    ],
    "core_concepts_zh": [
      "能力分级：智能体能力的阶梯式分层，每个层级在证明前一层级的可靠性后解锁",
      "评估驱动开发：智能体演进由量化评估套件引导，基准测试质量、安全性和可靠性",
      "影子/金丝雀部署：新智能体能力与已验证能力并行运行，输出被比对但不暴露给用户",
      "人类反馈循环：用户纠正、点赞/踩和升级模式驱动能力改进的优先排序",
      "优雅降级：当新引入的能力遇到边缘情况时，智能体必须回退到更简单、已验证的行为"
    ],
    "timeline": [
      [
        "2022",
        "Early LLM-based agents (AutoGPT, BabyAGI) demonstrate the need for structured agent evolution"
      ],
      [
        "2023",
        "Anthropic, OpenAI, and others publish best practices for safe, incremental AI agent deployment"
      ],
      [
        "2024",
        "Eval-driven agent development becomes standard practice with frameworks like Braintrust, Langsmith, and Weights & Biases"
      ],
      [
        "2025",
        "Multi-agent orchestration platforms mature, enabling teams to evolve complex agent systems with staged rollouts"
      ],
      [
        "2025",
        "Claude Code and similar agentic coding tools demonstrate practical evolutionary capability staging in production"
      ]
    ],
    "timeline_zh": [
      [
        "2022",
        "早期 LLM 智能体（AutoGPT、BabyAGI）揭示了结构化智能体演进的必要性"
      ],
      [
        "2023",
        "Anthropic、OpenAI 等发布安全、增量 AI 智能体部署的最佳实践"
      ],
      [
        "2024",
        "评估驱动的智能体开发成为标准实践，出现 Braintrust、Langsmith、Weights & Biases 等框架"
      ],
      [
        "2025",
        "多智能体编排平台成熟，使团队能够通过分阶段发布演进复杂智能体系统"
      ],
      [
        "2025",
        "Claude Code 等智能体编码工具在生产中展示了实际的演进式能力分级"
      ]
    ],
    "dos": [
      "Build comprehensive eval suites before adding new capabilities — you cannot improve what you cannot measure",
      "Deploy new capabilities in shadow mode first and compare outputs against the established baseline",
      "Implement graceful degradation so agent failures fall back to simpler, proven behaviors",
      "Collect structured human feedback on every agent interaction to guide capability prioritization"
    ],
    "dos_zh": [
      "在添加新能力之前构建全面的评估套件——无法衡量就无法改进",
      "先以影子模式部署新能力，将输出与已建立的基线进行比对",
      "实现优雅降级，使智能体故障能回退到更简单、已验证的行为",
      "在每次智能体交互中收集结构化的人类反馈，以指导能力优先级排序"
    ],
    "donts": [
      "Don't grant agents full tool access from day one — stage capabilities and earn trust incrementally",
      "Don't skip eval suites because 'the agent seems to work' — vibes-based evaluation misses critical failures",
      "Don't evolve multiple agent capabilities simultaneously — isolate changes to understand their individual impact",
      "Don't ignore safety boundaries — capability expansion must be accompanied by guardrails and monitoring"
    ],
    "donts_zh": [
      "不要从第一天就赋予智能体完整工具访问权限——分阶段开放能力并逐步赢得信任",
      "不要因为「智能体看起来能用」就跳过评估套件——凭感觉的评估会遗漏关键故障",
      "不要同时演进多个智能体能力——隔离变更以理解各自的独立影响",
      "不要忽视安全边界——能力扩展必须伴随护栏和监控"
    ],
    "case_study_company": "Anthropic",
    "case_study": "Anthropic evolved Claude's agentic capabilities through a staged approach with Claude Code and the tool-use API. Initial releases supported simple single-tool calls, evaluated against safety and accuracy benchmarks. Subsequent capability tiers added multi-tool orchestration, file editing, and web browsing — each tier gated by eval suite performance. This evolutionary approach allowed Anthropic to ship agentic features to millions of developers while maintaining safety standards, with each capability tier building confidence from the reliability demonstrated at the previous level.",
    "case_study_zh": "Anthropic 通过 Claude Code 和工具使用 API 以分阶段方式演进 Claude 的智能体能力。初始版本支持简单的单工具调用，根据安全性和准确性基准进行评估。后续能力层级增加了多工具编排、文件编辑和网页浏览——每个层级都以评估套件表现为门控。这种演进式方法使 Anthropic 能够在保持安全标准的同时向数百万开发者发布智能体功能，每个能力层级都建立在前一层级所展示的可靠性基础上。",
    "when_not_to_use": [
      "Simple, stateless AI completions (e.g., text summarization) that don't require agentic capabilities",
      "Agents operating in fully sandboxed environments where safety risks are negligible",
      "One-off demo or proof-of-concept agents that won't be deployed to production",
      "Use cases where the full agent capability set is well-understood and thoroughly tested upfront"
    ],
    "when_not_to_use_zh": [
      "简单的无状态 AI 补全（如文本摘要），不需要智能体能力",
      "在完全沙箱化环境中运行的智能体，安全风险可忽略",
      "不会部署到生产的一次性演示或概念验证智能体",
      "完整智能体能力集已被充分理解并已提前彻底测试的场景"
    ],
    "adopters": [
      "Anthropic",
      "OpenAI",
      "Google DeepMind",
      "Microsoft",
      "Replit"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Anthropic (2024). \"Building Effective Agents\". anthropic.com.",
    "secondary_sources": [
      "Andrew Ng (2024). \"Agentic Design Patterns\". deeplearning.ai.",
      "Harrison Chase et al. (2023). \"LangGraph: Multi-Actor Agent Framework\". langchain.com."
    ],
    "typed_relations": [
      {
        "slug": "agent-deployment-patterns",
        "type": "complement"
      },
      {
        "slug": "agent-reliability-patterns",
        "type": "complement"
      },
      {
        "slug": "continuous-architecture",
        "type": "related"
      }
    ]
  },
  {
    "id": 86,
    "name": "Mikado Method",
    "name_zh": "米卡多方法",
    "slug": "mikado-method",
    "category": "evolution",
    "desc": "Visualize and untangle large refactorings as a dependency graph of small safe steps",
    "desc_zh": "将大型重构可视化为依赖图，分解为一系列小而安全的步骤",
    "steps": [
      "State the refactoring goal as a single root node and attempt it naively in the codebase",
      "When compilation or tests break, record each blocker as a child node in the Mikado Graph",
      "Revert all changes and recurse: attempt each leaf node (blocker) the same way",
      "Continue building the graph until all leaves are independently achievable without breakage",
      "Implement leaves first, working bottom-up through the graph until the root goal is achieved"
    ],
    "steps_zh": [
      "将重构目标设为单一根节点，在代码库中直接尝试实现",
      "当编译或测试失败时，将每个阻塞点记录为米卡多图中的子节点",
      "回滚所有变更并递归：用同样方式尝试每个叶节点（阻塞项）",
      "持续构建依赖图，直到所有叶节点均可独立完成而不引发问题",
      "从叶节点开始实现，自底向上逐步完成，直至达成根节点目标"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Goal",
      "Prerequisite",
      "Leaf Node",
      "Revert"
    ],
    "viz_labels_zh": [
      "目标变更",
      "前置条件",
      "叶子节点",
      "回滚"
    ],
    "related": [
      "technical-debt-quadrant",
      "ai-assisted-refactoring",
      "strangler-fig-pattern"
    ],
    "tags": [
      "mikado",
      "refactoring",
      "dependency-graph",
      "safe-steps"
    ],
    "origin_author": "Ola Ellnestam & Daniel Brolund, 2014",
    "origin_source": "The Mikado Method (Manning Publications)",
    "origin_source_zh": "《米卡多方法》（Manning 出版）",
    "complexity": "intermediate",
    "when_to_use": [
      "A large refactoring that keeps failing because hidden dependencies break things unexpectedly",
      "Untangling a deeply coupled legacy codebase where each change triggers a cascade of compilation errors",
      "Planning a multi-sprint refactoring effort where the team needs to see the full dependency map before starting",
      "Teaching junior developers how to approach large-scale refactoring safely and systematically"
    ],
    "when_to_use_zh": [
      "大型重构因隐藏依赖不断导致意外失败",
      "解耦深度耦合的遗留代码库，每次变更都引发连锁编译错误",
      "规划多迭代重构工作，团队需要在开始前看到完整的依赖图",
      "教授初级开发者如何安全、系统地进行大规模重构"
    ],
    "core_concepts": [
      "Mikado Graph: A directed acyclic graph where the root is the refactoring goal and children are prerequisites that must be satisfied first",
      "Naive attempt: Deliberately try the change and let it fail, using the failures to discover hidden dependencies",
      "Revert-and-recurse: After each failed attempt, revert all changes and apply the same process to each discovered prerequisite",
      "Leaf-first implementation: Work from the bottom of the graph upward, implementing only leaves that can be done independently",
      "Safe, small steps: Each leaf implementation is small enough to commit, test, and deploy independently without breaking the system"
    ],
    "core_concepts_zh": [
      "米卡多图：有向无环图，根节点是重构目标，子节点是必须先满足的前置条件",
      "直接尝试：有意地进行变更并让它失败，利用失败来发现隐藏的依赖",
      "回滚并递归：每次失败尝试后，回滚所有变更并对每个发现的前置条件应用相同过程",
      "叶节点优先实现：从图的底部向上工作，只实现可以独立完成的叶节点",
      "小而安全的步骤：每个叶节点的实现足够小，可以独立提交、测试和部署而不破坏系统"
    ],
    "timeline": [
      [
        "2008",
        "Ola Ellnestam and Daniel Brolund begin developing the Mikado Method at Swedish consultancy Agical"
      ],
      [
        "2010",
        "The method is presented at multiple agile and software craftsmanship conferences"
      ],
      [
        "2014",
        "Manning publishes 'The Mikado Method' book, making the technique accessible to a wider audience"
      ],
      [
        "2016",
        "The method gains traction in legacy modernization projects across European enterprises"
      ],
      [
        "2023",
        "Practitioners combine Mikado with AI-assisted refactoring, using LLMs to explore the dependency graph faster"
      ]
    ],
    "timeline_zh": [
      [
        "2008",
        "Ola Ellnestam 和 Daniel Brolund 在瑞典咨询公司 Agical 开始开发米卡多方法"
      ],
      [
        "2010",
        "该方法在多个敏捷和软件工匠精神大会上发表"
      ],
      [
        "2014",
        "Manning 出版《米卡多方法》一书，使该技术为更广泛的受众所知"
      ],
      [
        "2016",
        "该方法在欧洲企业的遗留系统现代化项目中获得广泛采用"
      ],
      [
        "2023",
        "实践者将米卡多方法与 AI 辅助重构结合，使用 LLM 更快速地探索依赖图"
      ]
    ],
    "dos": [
      "Always revert after a failed naive attempt — the point is to discover dependencies, not to force changes through",
      "Draw the Mikado Graph on a whiteboard or shared diagram so the whole team can see the plan",
      "Commit each leaf-level change independently so the main branch stays green throughout the refactoring",
      "Use the graph as a communication tool in stand-ups to track progress through the refactoring"
    ],
    "dos_zh": [
      "在直接尝试失败后始终回滚——目的是发现依赖关系，而非强行推进变更",
      "在白板或共享图表上绘制米卡多图，使整个团队都能看到计划",
      "独立提交每个叶节点级别的变更，使主分支在整个重构过程中保持绿色",
      "在站会中使用依赖图作为沟通工具，追踪重构进展"
    ],
    "donts": [
      "Don't skip the revert step — pushing through a broken state defeats the purpose of the method",
      "Don't try to implement non-leaf nodes before their children are complete — you'll break things again",
      "Don't build a huge graph in one session — discover dependencies incrementally as you work",
      "Don't use Mikado for simple, well-understood refactorings — the overhead is only justified for complex dependency tangles"
    ],
    "donts_zh": [
      "不要跳过回滚步骤——在破坏状态下强行推进违背了该方法的初衷",
      "不要在子节点完成之前尝试实现非叶节点——否则会再次破坏系统",
      "不要试图在一次会议中构建完整的依赖图——随着工作推进增量发现依赖",
      "不要将米卡多方法用于简单、易理解的重构——其开销只对复杂依赖纠缠才合理"
    ],
    "case_study_company": "Ericsson",
    "case_study": "The Mikado Method originated from real-world work at Ericsson's telecom infrastructure division, where engineers faced massive C++ codebases with deep dependency chains. Traditional refactoring attempts caused cascading build failures across hundreds of files. By systematically using the naive-attempt-revert-graph cycle, the team mapped out the full dependency tree for a major module extraction and completed it over several weeks through safe, incremental leaf-first commits — each one passing CI independently.",
    "case_study_zh": "米卡多方法源于爱立信电信基础设施部门的实际工作，工程师们面对具有深层依赖链的大型 C++ 代码库。传统的重构尝试会在数百个文件中引发连锁编译失败。通过系统性地使用直接尝试-回滚-建图的循环，团队为一次重大模块提取绘制出完整的依赖树，并在数周内通过安全的叶节点优先增量提交完成——每次提交都独立通过 CI。",
    "when_not_to_use": [
      "Small refactorings with obvious, well-known dependency chains that don't need graphing",
      "Codebases with comprehensive test coverage where failed attempts are immediately caught and understood",
      "Throwaway or prototype code where breaking things temporarily is acceptable",
      "Refactorings that automated migration tools (e.g., codemods) can handle mechanically without dependency analysis"
    ],
    "when_not_to_use_zh": [
      "依赖链显而易见且众所周知的小型重构，不需要绘制依赖图",
      "测试覆盖全面的代码库，失败的尝试能立即被捕获和理解",
      "一次性或原型代码，临时破坏是可以接受的",
      "自动化迁移工具（如 codemod）即可机械完成、无需依赖分析的重构"
    ],
    "adopters": [
      "Ericsson",
      "Agical",
      "Spotify",
      "King (Candy Crush)",
      "Klarna"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Ola Ellnestam and Daniel Brolund (2014). \"The Mikado Method\". Manning Publications.",
    "secondary_sources": [
      "Michael Feathers (2004). \"Working Effectively with Legacy Code\". Prentice Hall.",
      "Martin Fowler (2018). \"Refactoring: Improving the Design of Existing Code, 2nd Edition\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "technical-debt-quadrant",
        "type": "complement"
      },
      {
        "slug": "ai-assisted-refactoring",
        "type": "complement"
      },
      {
        "slug": "strangler-fig-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 187,
    "name": "Evolutionary Database Design",
    "name_zh": "演进式数据库设计",
    "slug": "evolutionary-database-design",
    "category": "evolution",
    "desc": "Incremental schema changes (Sadalage & Fowler)",
    "desc_zh": "增量式模式变更（Sadalage & Fowler）",
    "steps": [
      "Treat every database change as a versioned migration script stored in version control alongside application code",
      "Use expand-contract pattern: add new columns/tables first, migrate data, then remove old structures",
      "Automate migration execution in CI/CD pipelines with tools like Flyway, Liquibase, or Alembic",
      "Ensure every migration is backward-compatible so the previous application version can coexist during deployment",
      "Monitor migration performance and data integrity in production; maintain rollback scripts for every migration"
    ],
    "steps_zh": [
      "将每个数据库变更视为版本化的迁移脚本，与应用代码一同存储在版本控制中",
      "使用扩展-收缩模式：先添加新列/表，迁移数据，再移除旧结构",
      "使用 Flyway、Liquibase 或 Alembic 等工具在 CI/CD 流水线中自动执行迁移",
      "确保每次迁移向后兼容，使上一版本的应用在部署期间可以共存",
      "在生产环境中监控迁移性能和数据完整性；为每次迁移维护回滚脚本"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Migration",
      "Backward Compat",
      "Deploy",
      "Cleanup"
    ],
    "viz_labels_zh": [
      "数据迁移",
      "向后兼容",
      "部署",
      "清理旧字段"
    ],
    "related": [
      "strangler-fig-pattern",
      "branch-by-abstraction",
      "gitops",
      "api-versioning-strategies"
    ],
    "tags": [
      "database",
      "schema-migration",
      "evolutionary-design",
      "backward-compatibility"
    ],
    "origin_author": "Pramod Sadalage & Martin Fowler, 2002-2016",
    "origin_source": "Refactoring Databases: Evolutionary Database Design (Pramod Sadalage & Scott Ambler, 2006); Evolutionary Database Design (martinfowler.com)",
    "origin_source_zh": "《数据库重构：演进式数据库设计》（Pramod Sadalage & Scott Ambler，2006）；演进式数据库设计（martinfowler.com）",
    "complexity": "intermediate",
    "when_to_use": [
      "Applications with continuous delivery pipelines that deploy multiple times per day",
      "Systems where database changes must not cause downtime or break running application instances",
      "Teams migrating from manual DBA-controlled schema changes to developer-owned database evolution",
      "Microservices architectures where each service owns its database schema independently"
    ],
    "when_to_use_zh": [
      "每天部署多次的持续交付流水线应用",
      "数据库变更不能导致停机或破坏运行中应用实例的系统",
      "从手动 DBA 控制的模式变更迁移至开发者自主数据库演进的团队",
      "每个服务独立拥有数据库模式的微服务架构"
    ],
    "core_concepts": [
      "Migration as Code: Database changes are first-class code artifacts — versioned, reviewed, tested, and deployed through the same pipeline as application code",
      "Expand-Contract Pattern: A two-phase approach where the schema is first expanded to support both old and new formats, then contracted to remove the old format after migration",
      "Backward Compatibility: Every migration must allow the previous application version to function correctly, enabling zero-downtime deployments",
      "Transition Period: The window during which both old and new schema elements coexist, requiring careful data synchronization and dual-write strategies"
    ],
    "core_concepts_zh": [
      "迁移即代码：数据库变更是一等代码制品——通过与应用代码相同的流水线进行版本化、评审、测试和部署",
      "扩展-收缩模式：两阶段方法，先扩展模式以同时支持新旧格式，迁移后再收缩以移除旧格式",
      "向后兼容：每次迁移必须允许上一版本应用正常运行，实现零停机部署",
      "过渡期：新旧模式元素共存的窗口期，需要仔细的数据同步和双写策略"
    ],
    "timeline": [
      [
        "2002",
        "Pramod Sadalage pioneers evolutionary database techniques at ThoughtWorks projects"
      ],
      [
        "2006",
        "Sadalage and Ambler publish Refactoring Databases, the seminal book on evolutionary database design"
      ],
      [
        "2010",
        "Flyway 1.0 released, bringing automated versioned migrations to the Java ecosystem"
      ],
      [
        "2016",
        "Martin Fowler publishes the updated Evolutionary Database Design article on martinfowler.com"
      ],
      [
        "2022",
        "Schema-as-code tools (Atlas, PlanetScale) bring evolutionary database design to cloud-native and serverless databases"
      ]
    ],
    "timeline_zh": [
      [
        "2002",
        "Pramod Sadalage 在 ThoughtWorks 项目中率先实践演进式数据库技术"
      ],
      [
        "2006",
        "Sadalage 和 Ambler 出版《数据库重构》，成为演进式数据库设计的开创性著作"
      ],
      [
        "2010",
        "Flyway 1.0 发布，为 Java 生态带来自动化版本迁移"
      ],
      [
        "2016",
        "Martin Fowler 在 martinfowler.com 发表更新版的演进式数据库设计文章"
      ],
      [
        "2022",
        "模式即代码工具（Atlas、PlanetScale）将演进式数据库设计带入云原生和无服务器数据库"
      ]
    ],
    "dos": [
      "Version every migration script with a sequential or timestamp-based identifier for ordering",
      "Test migrations against a production-like dataset — empty-database tests miss performance and data-integrity issues",
      "Use the expand-contract pattern for every breaking change to maintain backward compatibility during deployment",
      "Include both up and down migration scripts so any change can be rolled back safely"
    ],
    "dos_zh": [
      "使用顺序或时间戳标识符为每个迁移脚本进行版本编号",
      "针对类生产数据集测试迁移——空数据库测试会遗漏性能和数据完整性问题",
      "对每个破坏性变更使用扩展-收缩模式，在部署期间维持向后兼容",
      "为每个变更提供正向和反向迁移脚本，确保任何变更都可以安全回滚"
    ],
    "donts": [
      "Don't apply migrations manually in production — always use automated tooling through CI/CD pipelines",
      "Don't make destructive changes (DROP COLUMN, DROP TABLE) in the same migration as constructive ones",
      "Don't skip the transition period — removing old schema elements before all consumers have migrated causes outages",
      "Don't store migration state outside the database — use a dedicated migrations table for single-source-of-truth tracking"
    ],
    "donts_zh": [
      "不要在生产环境手动执行迁移——始终通过 CI/CD 流水线使用自动化工具",
      "不要在同一次迁移中将破坏性变更（DROP COLUMN、DROP TABLE）与构建性变更混合",
      "不要跳过过渡期——在所有消费者迁移完成前移除旧模式元素会导致中断",
      "不要将迁移状态存储在数据库外——使用专用的迁移表作为唯一事实来源"
    ],
    "case_study_company": "ThoughtWorks",
    "case_study": "ThoughtWorks pioneered evolutionary database design across dozens of enterprise client projects from 2002 onward. On a major retail banking transformation, the team managed over 400 incremental schema migrations across 18 months without a single deployment-related outage. By enforcing the expand-contract pattern and automated migration testing in CI, they enabled daily deployments on a database serving 12 million customer accounts.",
    "case_study_zh": "ThoughtWorks 从 2002 年开始在数十个企业客户项目中率先实践演进式数据库设计。在一次大型零售银行转型中，团队在 18 个月内管理了 400 多次增量模式迁移，未发生过一次与部署相关的中断。通过强制执行扩展-收缩模式和 CI 中的自动化迁移测试，他们在服务 1200 万客户账户的数据库上实现了每日部署。",
    "when_not_to_use": [
      "Greenfield projects with no existing data where schema can be freely redesigned without migration overhead",
      "Analytical/data warehouse schemas designed for batch ETL rather than continuous deployment",
      "Systems using schema-less databases (document stores) where schema evolution is handled at the application layer",
      "One-time data migrations where there is no ongoing need for incremental schema evolution"
    ],
    "when_not_to_use_zh": [
      "没有现有数据的全新项目——可以自由重新设计模式而无需迁移开销",
      "为批处理 ETL 而非持续部署设计的分析/数据仓库模式",
      "使用无模式数据库（文档存储）、模式演进在应用层处理的系统",
      "一次性数据迁移——不需要持续增量模式演进"
    ],
    "adopters": [
      "ThoughtWorks",
      "Spotify",
      "Shopify",
      "GitHub",
      "Stripe"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Pramod Sadalage and Scott Ambler (2006). \"Refactoring Databases: Evolutionary Database Design\". Addison-Wesley.",
    "secondary_sources": [
      "Martin Fowler and Pramod Sadalage (2003). \"Evolutionary Database Design\". martinfowler.com.",
      "Flyway Team (2010). \"Flyway: Database Migrations Made Easy\". flywaydb.org."
    ],
    "typed_relations": [
      {
        "slug": "strangler-fig-pattern",
        "type": "complement"
      },
      {
        "slug": "branch-by-abstraction",
        "type": "complement"
      },
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "api-versioning-strategies",
        "type": "complement"
      }
    ]
  },
  {
    "id": 188,
    "name": "Feature Branch Strategy",
    "name_zh": "特性分支策略",
    "slug": "feature-branch-strategy",
    "category": "evolution",
    "desc": "Git branching models (GitFlow, trunk-based)",
    "desc_zh": "Git 分支模型（GitFlow、主干开发等）",
    "steps": [
      "Choose a branching model aligned with team size and release cadence (GitFlow, GitHub Flow, trunk-based)",
      "Define branch naming conventions, lifecycle policies, and merge/rebase rules",
      "Enforce branch protection rules: required reviews, CI pass, and no direct pushes to main/trunk",
      "Keep feature branches short-lived (hours to days) to minimize merge conflicts and integration risk",
      "Merge to mainline via pull request after passing automated tests, code review, and any required approvals"
    ],
    "steps_zh": [
      "选择与团队规模和发布节奏匹配的分支模型（GitFlow、GitHub Flow、主干开发）",
      "定义分支命名规范、生命周期策略和合并/变基规则",
      "强制执行分支保护规则：必须评审、CI 通过、禁止直接推送至主分支/主干",
      "保持特性分支短命（数小时到数天），以最小化合并冲突和集成风险",
      "通过拉取请求合并至主线，需通过自动化测试、代码评审和所有必要审批"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Branch",
      "Develop",
      "Review",
      "Merge"
    ],
    "viz_labels_zh": [
      "功能分支",
      "开发",
      "代码审查",
      "合并"
    ],
    "related": [
      "gitops",
      "feature-flags",
      "branch-by-abstraction"
    ],
    "tags": [
      "git",
      "branching",
      "gitflow",
      "trunk-based",
      "version-control"
    ],
    "origin_author": "Vincent Driessen (GitFlow, 2010); various (trunk-based development, ~2000s)",
    "origin_source": "A Successful Git Branching Model (Vincent Driessen, 2010); trunkbaseddevelopment.com (Paul Hammant)",
    "origin_source_zh": "《一个成功的 Git 分支模型》（Vincent Driessen，2010）；trunkbaseddevelopment.com（Paul Hammant）",
    "complexity": "beginner",
    "when_to_use": [
      "Teams collaborating on a shared codebase that need a structured workflow for parallel development",
      "Projects with formal release cycles requiring release branches and hotfix processes",
      "Open-source projects where external contributors submit changes via pull requests from forks",
      "Organizations enforcing code review and CI checks before any code reaches the mainline"
    ],
    "when_to_use_zh": [
      "在共享代码库上协作、需要结构化并行开发工作流的团队",
      "需要发布分支和热修复流程的正式发布周期项目",
      "外部贡献者通过 Fork 提交拉取请求的开源项目",
      "在代码进入主线前强制执行代码评审和 CI 检查的组织"
    ],
    "core_concepts": [
      "Feature Branch: A short-lived branch created for a single feature or fix, isolated from mainline until ready to merge",
      "Trunk-Based Development: A model where all developers commit to a single trunk (main) at least daily, using feature flags for incomplete work",
      "GitFlow: A structured model with develop, feature, release, and hotfix branches suited to versioned software with scheduled releases",
      "Branch Protection: Server-side rules that prevent direct pushes and enforce quality gates (reviews, CI) before merge"
    ],
    "core_concepts_zh": [
      "特性分支：为单个功能或修复创建的短命分支，在准备合并前与主线隔离",
      "主干开发：所有开发者至少每天向单一主干（main）提交，使用特性开关管理未完成工作",
      "GitFlow：带有 develop、feature、release 和 hotfix 分支的结构化模型，适合有计划发布的版本化软件",
      "分支保护：服务端规则，防止直接推送并在合并前强制执行质量门控（评审、CI）"
    ],
    "timeline": [
      [
        "2005",
        "Git released by Linus Torvalds, enabling lightweight branching that makes feature branches practical"
      ],
      [
        "2010",
        "Vincent Driessen publishes 'A Successful Git Branching Model' (GitFlow), defining the most influential branching strategy"
      ],
      [
        "2011",
        "GitHub Flow emerges as a simpler alternative: branch, commit, PR, deploy, merge"
      ],
      [
        "2017",
        "Trunk-based development gains momentum as Google and Facebook advocate for short-lived branches and feature flags"
      ],
      [
        "2023",
        "DORA research confirms trunk-based development with short-lived branches correlates with elite DevOps performance"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "Linus Torvalds 发布 Git，轻量级分支使特性分支变得实用"
      ],
      [
        "2010",
        "Vincent Driessen 发表《一个成功的 Git 分支模型》（GitFlow），定义了最具影响力的分支策略"
      ],
      [
        "2011",
        "GitHub Flow 作为更简单的替代方案出现：分支、提交、PR、部署、合并"
      ],
      [
        "2017",
        "随着 Google 和 Facebook 倡导短命分支和特性开关，主干开发获得发展动力"
      ],
      [
        "2023",
        "DORA 研究证实主干开发配合短命分支与精英级 DevOps 表现正相关"
      ]
    ],
    "dos": [
      "Keep feature branches as short-lived as possible — ideally merging within 1-2 days to reduce integration risk",
      "Rebase feature branches onto mainline (or merge mainline into them) daily to detect conflicts early and keep branches current",
      "Use branch naming conventions (feature/, fix/, chore/) to communicate intent and enable automation",
      "Delete branches immediately after merging to keep the repository clean and avoid stale branch accumulation"
    ],
    "dos_zh": [
      "尽量保持特性分支短命——理想情况下在 1-2 天内合并以减少集成风险",
      "每天将特性分支变基到主线之上（或将主线合并进特性分支），尽早发现冲突并保持分支最新",
      "使用分支命名规范（feature/、fix/、chore/）传达意图并支持自动化",
      "合并后立即删除分支以保持仓库整洁，避免陈旧分支堆积"
    ],
    "donts": [
      "Don't let feature branches live for weeks — long-lived branches lead to painful merge conflicts and integration surprises",
      "Don't use GitFlow for continuous deployment — its release branch ceremony adds overhead when you deploy multiple times daily",
      "Don't skip CI on feature branches — bugs found after merge are much more expensive to fix than bugs caught on the branch",
      "Don't merge without code review — even small changes benefit from a second pair of eyes for knowledge sharing and quality"
    ],
    "donts_zh": [
      "不要让特性分支存活数周——长命分支导致痛苦的合并冲突和集成意外",
      "不要在持续部署场景使用 GitFlow——当每天多次部署时，发布分支仪式会增加开销",
      "不要跳过特性分支上的 CI——合并后发现的 Bug 修复成本远高于在分支上发现的",
      "不要在没有代码评审的情况下合并——即使小变更也能从第二双眼睛中获益，促进知识共享和质量提升"
    ],
    "case_study_company": "Google",
    "case_study": "Google operates one of the largest monorepos in the world with over 80 million commits, using a trunk-based development model. All 25,000+ engineers commit to a single trunk, with changes gated by automated testing and code review. Feature flags manage incomplete work, and short-lived branches (typically hours) ensure continuous integration. This approach enables Google to deploy thousands of changes per day while maintaining code health at scale.",
    "case_study_zh": "Google 运营着全球最大的单体仓库之一，拥有超过 8000 万次提交，采用主干开发模型。所有 25000 多名工程师向单一主干提交，变更通过自动化测试和代码评审门控。特性开关管理未完成工作，短命分支（通常为数小时）确保持续集成。这种方法使 Google 能够每天部署数千项变更，同时大规模维护代码健康。",
    "when_not_to_use": [
      "Solo developers or very small teams where branching overhead exceeds its coordination benefits",
      "Experimental or research codebases where rigid branching rules slow down exploration",
      "Projects with no CI/CD pipeline where branch-based quality gates cannot be enforced automatically",
      "Monorepo setups that require specialized merge strategies beyond standard Git branching"
    ],
    "when_not_to_use_zh": [
      "单人开发者或极小团队——分支开销超过其协调收益",
      "严格分支规则会减慢探索速度的实验或研究代码库",
      "没有 CI/CD 流水线、无法自动执行分支质量门控的项目",
      "需要超越标准 Git 分支的专用合并策略的单体仓库设置"
    ],
    "adopters": [
      "Google",
      "Facebook (Meta)",
      "GitHub",
      "Microsoft",
      "Atlassian"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Vincent Driessen (2010). \"A Successful Git Branching Model\". nvie.com.",
    "secondary_sources": [
      "Paul Hammant (2017). \"Trunk Based Development\". trunkbaseddevelopment.com.",
      "Jez Humble and David Farley (2010). \"Continuous Delivery: Reliable Software Releases through Build, Test, and Deployment Automation\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "feature-flags",
        "type": "complement"
      },
      {
        "slug": "branch-by-abstraction",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 189,
    "name": "Deprecation Strategy",
    "name_zh": "弃用策略",
    "slug": "deprecation-strategy",
    "category": "evolution",
    "desc": "Systematic approach to retiring old APIs/features",
    "desc_zh": "系统性地退役旧 API 和功能的方法",
    "steps": [
      "Announce deprecation with a clear timeline, migration guide, and sunset date communicated through multiple channels",
      "Provide a migration path: new API, data export tools, or replacement feature that covers existing use cases",
      "Add runtime deprecation warnings (HTTP headers, log messages, SDK warnings) to track usage of deprecated features",
      "Monitor deprecation metrics: track the decline in deprecated feature usage and identify blocking consumers",
      "Remove the deprecated feature on the sunset date, with a final grace period and direct outreach to remaining consumers"
    ],
    "steps_zh": [
      "通过多渠道宣布弃用，明确时间线、迁移指南和日落日期",
      "提供迁移路径：覆盖现有用例的新 API、数据导出工具或替代功能",
      "添加运行时弃用警告（HTTP 头、日志消息、SDK 警告）以跟踪弃用功能的使用情况",
      "监控弃用指标：跟踪弃用功能使用量的下降并识别阻塞的消费者",
      "在日落日期移除弃用功能，设置最终宽限期并直接联系剩余消费者"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Announce",
      "Warn",
      "Sunset",
      "Remove"
    ],
    "viz_labels_zh": [
      "宣告废弃",
      "警告期",
      "日落期",
      "移除"
    ],
    "related": [
      "api-versioning-strategies",
      "strangler-fig-pattern",
      "semantic-versioning"
    ],
    "tags": [
      "deprecation",
      "api-lifecycle",
      "sunset",
      "migration",
      "backward-compatibility"
    ],
    "origin_author": "Industry practice formalized across Google, Stripe, and major API providers, ~2010s",
    "origin_source": "API Design Patterns (JJ Geewax, Manning 2021); Google API Improvement Proposals (AIP-181: Stability levels, incl. deprecation)",
    "origin_source_zh": "《API 设计模式》（JJ Geewax，Manning 2021）；Google API 改进提案（AIP-181：稳定性级别，含弃用规定）",
    "complexity": "intermediate",
    "when_to_use": [
      "Public APIs with external consumers who need advance notice and migration support",
      "Internal platforms where multiple teams depend on shared services that need to evolve",
      "SDK and library maintainers who must remove features without breaking downstream consumers",
      "Organizations accumulating technical debt from features that should have been retired years ago"
    ],
    "when_to_use_zh": [
      "拥有需要提前通知和迁移支持的外部消费者的公共 API",
      "多个团队依赖需要演进的共享服务的内部平台",
      "必须在不破坏下游消费者的情况下移除功能的 SDK 和库维护者",
      "因多年未退役的功能而积累技术债的组织"
    ],
    "core_concepts": [
      "Sunset Date: A published, non-negotiable date after which the deprecated feature will be removed entirely",
      "Migration Path: A documented, tested alternative that covers all use cases of the deprecated feature",
      "Deprecation Warning: Runtime or build-time signals (HTTP Sunset header, compiler warnings, log messages) that alert consumers to upcoming removal",
      "Usage Tracking: Monitoring deprecated feature consumption to identify migration stragglers and measure progress toward sunset"
    ],
    "core_concepts_zh": [
      "日落日期：已公布的、不可协商的日期，之后弃用功能将被完全移除",
      "迁移路径：已记录、已测试的替代方案，覆盖弃用功能的所有用例",
      "弃用警告：提醒消费者即将移除的运行时或构建时信号（HTTP Sunset 头、编译器警告、日志消息）",
      "使用量跟踪：监控弃用功能的消费情况，以识别迁移滞后者并衡量日落进度"
    ],
    "timeline": [
      [
        "2011",
        "Google establishes formal API deprecation policies with multi-year sunset timelines for Google APIs"
      ],
      [
        "2015",
        "Stripe pioneers developer-friendly deprecation with versioned API, migration guides, and dashboard warnings"
      ],
      [
        "2019",
        "HTTP Sunset header (RFC 8594) standardized, giving APIs a machine-readable deprecation signal"
      ],
      [
        "2021",
        "JJ Geewax covers deprecation patterns in API Design Patterns (Manning), formalizing industry best practices"
      ],
      [
        "2024",
        "AI-assisted migration tools emerge, using LLMs to auto-generate code changes for deprecated API consumers"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Google 建立正式的 API 弃用策略，为 Google API 设定多年期日落时间线"
      ],
      [
        "2015",
        "Stripe 率先推出开发者友好的弃用方式——版本化 API、迁移指南和控制台警告"
      ],
      [
        "2019",
        "HTTP Sunset 头（RFC 8594）标准化，为 API 提供机器可读的弃用信号"
      ],
      [
        "2021",
        "JJ Geewax 在《API 设计模式》（Manning）中涵盖弃用模式，正式化行业最佳实践"
      ],
      [
        "2024",
        "AI 辅助迁移工具出现，使用大模型自动生成弃用 API 消费者的代码变更"
      ]
    ],
    "dos": [
      "Announce deprecation at least 6-12 months before sunset for public APIs — more for critical infrastructure",
      "Provide migration tooling (codemods, scripts, SDK helpers) that automate as much of the migration as possible",
      "Use the HTTP Sunset header (RFC 8594) to give API consumers a machine-readable deprecation signal",
      "Track deprecated feature usage metrics on a dashboard and proactively reach out to top consumers"
    ],
    "dos_zh": [
      "公共 API 至少在日落前 6-12 个月宣布弃用——关键基础设施需要更长时间",
      "提供迁移工具（代码修改器、脚本、SDK 辅助工具），尽可能自动化迁移过程",
      "使用 HTTP Sunset 头（RFC 8594）向 API 消费者提供机器可读的弃用信号",
      "在仪表板上跟踪弃用功能使用指标并主动联系主要消费者"
    ],
    "donts": [
      "Don't deprecate without providing a clear, documented migration path — deprecation without an alternative is abandonment",
      "Don't silently remove features — always announce, warn at runtime, and give consumers time to migrate",
      "Don't extend sunset dates repeatedly — this trains consumers to ignore deprecation notices entirely",
      "Don't deprecate too many things at once — migration fatigue causes consumers to disengage and fall behind"
    ],
    "donts_zh": [
      "不要在没有提供清晰、已记录的迁移路径的情况下弃用——没有替代方案的弃用是遗弃",
      "不要静默移除功能——始终宣布、运行时警告并给消费者迁移时间",
      "不要反复延长日落日期——这会训练消费者完全忽略弃用通知",
      "不要一次弃用太多东西——迁移疲劳会导致消费者脱离并落后"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe manages one of the most developer-friendly deprecation processes in the industry. Every API version is supported for years, with clear deprecation timelines published in their API changelog. When deprecating the 2019-02-19 API version, Stripe provided per-account migration dashboards showing exactly which endpoints needed updating, automated test mode switching to the new version, and dedicated migration support for high-volume merchants. This approach achieved 98% voluntary migration before the sunset date.",
    "case_study_zh": "Stripe 管理着业界最开发者友好的弃用流程之一。每个 API 版本支持数年，在 API 变更日志中发布清晰的弃用时间线。弃用 2019-02-19 API 版本时，Stripe 提供了每账户迁移仪表板，精确显示哪些端点需要更新，自动将测试模式切换至新版本，并为高流量商户提供专属迁移支持。这种方式在日落日期前实现了 98% 的自愿迁移。",
    "when_not_to_use": [
      "Internal-only APIs with a single consumer team that can coordinate changes directly without formal deprecation",
      "Experimental or alpha APIs where consumers understand features may be removed without notice",
      "One-time migration projects where the old system will be fully decommissioned in a single cutover",
      "APIs with zero external consumers where the overhead of formal deprecation exceeds the coordination benefit"
    ],
    "when_not_to_use_zh": [
      "只有单一消费团队的内部 API——可以直接协调变更而无需正式弃用",
      "消费者理解功能可能不经通知即被移除的实验性或 Alpha API",
      "旧系统将在单次切换中完全退役的一次性迁移项目",
      "没有外部消费者、正式弃用开销超过协调收益的 API"
    ],
    "adopters": [
      "Stripe",
      "Google",
      "Twilio",
      "GitHub",
      "AWS"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "JJ Geewax (2021). \"API Design Patterns\". Manning Publications.",
    "secondary_sources": [
      "Google (2020). \"AIP-181: Stability levels\". google.aip.dev.",
      "Stripe (2017). \"API Versioning at Stripe\". stripe.com/blog."
    ],
    "typed_relations": [
      {
        "slug": "api-versioning-strategies",
        "type": "complement"
      },
      {
        "slug": "strangler-fig-pattern",
        "type": "complement"
      },
      {
        "slug": "semantic-versioning",
        "type": "complement"
      }
    ]
  },
  {
    "id": 190,
    "name": "Living Documentation",
    "name_zh": "活文档",
    "slug": "living-documentation",
    "category": "evolution",
    "desc": "Generate docs from tests and code (Martraire, 2019)",
    "desc_zh": "从测试和代码生成文档（Martraire，2019）",
    "steps": [
      "Write executable specifications (BDD scenarios, annotated tests) that serve as both tests and documentation",
      "Annotate domain code with metadata (custom annotations, docstrings, tags) that documentation generators can extract",
      "Configure automated documentation generation in the CI/CD pipeline, producing output on every commit",
      "Publish generated documentation to a developer portal or wiki that is always in sync with the latest code",
      "Review documentation coverage as part of code review — untested behavior is undocumented behavior"
    ],
    "steps_zh": [
      "编写可执行规范（BDD 场景、带注解的测试），同时充当测试和文档",
      "用元数据（自定义注解、文档字符串、标签）标注领域代码，供文档生成器提取",
      "在 CI/CD 流水线中配置自动化文档生成，每次提交都产生输出",
      "将生成的文档发布到与最新代码始终同步的开发者门户或 Wiki",
      "将文档覆盖率作为代码评审的一部分——未测试的行为就是未记录的行为"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Code",
      "Generate",
      "Publish",
      "Review"
    ],
    "viz_labels_zh": [
      "代码",
      "生成文档",
      "发布",
      "审阅"
    ],
    "related": [
      "bdd",
      "domain-driven-design",
      "gitops",
      "openapi-specification"
    ],
    "tags": [
      "documentation",
      "living-docs",
      "bdd",
      "executable-specs",
      "code-as-documentation"
    ],
    "origin_author": "Cyrille Martraire, 2019",
    "origin_source": "Living Documentation: Continuous Knowledge Sharing by Design (Cyrille Martraire, Addison-Wesley 2019)",
    "origin_source_zh": "《活文档：通过设计实现持续知识共享》（Cyrille Martraire，Addison-Wesley 2019）",
    "complexity": "intermediate",
    "when_to_use": [
      "Projects where documentation is perpetually outdated because it is maintained separately from code",
      "Domain-rich applications where business rules embedded in code need to be visible to non-developers",
      "Teams practicing BDD or specification-by-example that want to maximize the value of their executable specs",
      "APIs and libraries where consumers need always-current reference documentation"
    ],
    "when_to_use_zh": [
      "文档因与代码分开维护而长期过时的项目",
      "嵌入代码中的业务规则需要对非开发人员可见的领域丰富应用",
      "实践 BDD 或实例化需求（Specification by Example）、希望最大化可执行规范价值的团队",
      "消费者需要始终最新参考文档的 API 和库"
    ],
    "core_concepts": [
      "Executable Specification: Tests that are written in a human-readable format (Gherkin, annotated unit tests) and double as system documentation",
      "Single Source of Truth: Documentation is generated from code and tests, eliminating the drift between what the docs say and what the code does",
      "Knowledge Augmentation: Annotations and naming conventions in code carry domain knowledge that documentation generators can harvest automatically",
      "Evergreen Documentation: Because docs are regenerated on every CI build, they are always current — stale documentation is structurally impossible"
    ],
    "core_concepts_zh": [
      "可执行规范：以人类可读格式（Gherkin、带注解的单元测试）编写的测试，同时充当系统文档",
      "唯一事实来源：文档从代码和测试生成，消除文档描述与代码行为之间的偏差",
      "知识增强：代码中的注解和命名规范携带领域知识，文档生成器可以自动提取",
      "常青文档：因为每次 CI 构建都重新生成文档，所以文档始终是最新的——结构上不可能出现过时文档"
    ],
    "timeline": [
      [
        "2011",
        "Specification by Example (Gojko Adzic) promotes executable specs as the bridge between requirements and tests"
      ],
      [
        "2015",
        "Tools like Serenity BDD and Pickles generate living documentation from Cucumber/Gherkin scenarios"
      ],
      [
        "2019",
        "Cyrille Martraire publishes Living Documentation, the definitive book on generating docs from code and tests"
      ],
      [
        "2021",
        "OpenAPI/Swagger generators and Storybook become mainstream, making API and UI living documentation standard practice"
      ],
      [
        "2024",
        "LLM-powered documentation generators auto-summarize code changes and produce natural-language docs from test suites"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "《实例化需求》（Specification by Example，Gojko Adzic）推广可执行规范作为需求与测试之间的桥梁"
      ],
      [
        "2015",
        "Serenity BDD 和 Pickles 等工具从 Cucumber/Gherkin 场景生成活文档"
      ],
      [
        "2019",
        "Cyrille Martraire 出版《活文档》，成为从代码和测试生成文档的权威著作"
      ],
      [
        "2021",
        "OpenAPI/Swagger 生成器和 Storybook 成为主流，使 API 和 UI 活文档成为标准实践"
      ],
      [
        "2024",
        "LLM 驱动的文档生成器自动总结代码变更，从测试套件生成自然语言文档"
      ]
    ],
    "dos": [
      "Name tests and specifications using domain language so generated docs are meaningful to non-technical stakeholders",
      "Integrate documentation generation into CI/CD so docs are rebuilt and published on every merge to main",
      "Use BDD/Gherkin scenarios for business-critical flows — they serve as both acceptance tests and stakeholder-readable docs",
      "Treat documentation coverage as a quality metric alongside test coverage"
    ],
    "dos_zh": [
      "使用领域语言命名测试和规范，使生成的文档对非技术利益相关者有意义",
      "将文档生成集成到 CI/CD 中，使文档在每次合并到主分支时重建和发布",
      "对业务关键流程使用 BDD/Gherkin 场景——它们既是验收测试又是利益相关者可读的文档",
      "将文档覆盖率与测试覆盖率一样作为质量指标"
    ],
    "donts": [
      "Don't maintain separate documentation that duplicates what tests already describe — it will inevitably drift",
      "Don't generate documentation that nobody reads — validate that the output format and location match how consumers access docs",
      "Don't write tests solely for documentation purposes — every executable spec must also be a real, meaningful test",
      "Don't ignore the readability of generated docs — invest in formatting, navigation, and search for the output"
    ],
    "donts_zh": [
      "不要维护与测试描述重复的独立文档——它必然会与实际行为产生偏差",
      "不要生成没人阅读的文档——验证输出格式和位置是否符合消费者访问文档的方式",
      "不要仅为了文档目的编写测试——每个可执行规范必须同时是真正有意义的测试",
      "不要忽略生成文档的可读性——为输出投入格式化、导航和搜索功能"
    ],
    "case_study_company": "Société Générale",
    "case_study": "Société Générale's Corporate & Investment Banking division adopted living documentation for their trade processing platform, generating business-readable documentation from 3,000+ BDD scenarios. Business analysts could review the generated docs to verify that regulatory rules were correctly implemented without reading code. This approach reduced documentation maintenance effort by 60% and eliminated compliance audit findings related to documentation-code discrepancies.",
    "case_study_zh": "法国兴业银行企业与投资银行部门为其交易处理平台采用活文档，从 3000 多个 BDD 场景生成业务可读文档。业务分析师可以审阅生成的文档来验证监管规则是否正确实现而无需阅读代码。这种方法减少了 60% 的文档维护工作量，并消除了与文档-代码不一致相关的合规审计发现。",
    "when_not_to_use": [
      "Projects with minimal domain complexity where simple README files are sufficient documentation",
      "Teams without existing test automation — living documentation requires a mature test suite as its foundation",
      "Contexts where the primary audience is end-users, not developers or analysts — user-facing docs require different formats",
      "Codebases in rapid prototyping phases where the code itself is too volatile for generated docs to be useful"
    ],
    "when_not_to_use_zh": [
      "领域复杂度极低、简单 README 文件即可满足文档需求的项目",
      "没有现有测试自动化的团队——活文档需要成熟的测试套件作为基础",
      "主要受众是终端用户而非开发者或分析师的场景——面向用户的文档需要不同格式",
      "处于快速原型阶段、代码本身变化太快导致生成文档无用的代码库"
    ],
    "adopters": [
      "Société Générale",
      "Spotify",
      "ThoughtWorks",
      "GOV.UK",
      "Zalando"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "Cyrille Martraire (2019). \"Living Documentation: Continuous Knowledge Sharing by Design\". Addison-Wesley.",
    "secondary_sources": [
      "Gojko Adzic (2011). \"Specification by Example: How Successful Teams Deliver the Right Software\". Manning Publications.",
      "Dan North (2006). \"Introducing BDD\". dannorth.net."
    ],
    "typed_relations": [
      {
        "slug": "bdd",
        "type": "complement"
      },
      {
        "slug": "domain-driven-design",
        "type": "complement"
      },
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "openapi-specification",
        "type": "complement"
      }
    ]
  },
  {
    "id": 245,
    "name": "Fitness Function-Driven Development",
    "name_zh": "适应度函数驱动开发",
    "slug": "fitness-function-driven-development",
    "category": "evolution",
    "desc": "Automated architecture governance using executable fitness functions that continuously verify architectural characteristics are preserved during system evolution",
    "desc_zh": "使用可执行适应度函数进行自动化架构治理，持续验证架构特性在系统演进过程中得以保留",
    "steps": [
      "Identify the architectural characteristics (coupling thresholds, cyclic dependency limits, performance SLOs, security compliance rules) that must be preserved as the system evolves, drawing from ADRs, architecture principles documents, and non-functional requirements",
      "Implement each characteristic as an executable fitness function: coupling metrics as JDepend/ArchUnit rules, performance SLOs as load test assertions in CI, security posture as automated vulnerability scan thresholds, and data residency as policy-as-code rules",
      "Classify fitness functions by execution cadence: atomic (run on every commit in the CI pipeline), triggered (run on a schedule or deployment event), and continual (run as production synthetic monitors)",
      "Integrate fitness function results into the deployment pipeline as hard gates — failing fitness functions block promotion to the next environment, treating architectural violations as build failures",
      "Review and evolve fitness function thresholds at each architectural review: retiring obsolete functions, tightening thresholds as the system matures, and adding new functions when new architectural risks are identified"
    ],
    "steps_zh": [
      "识别系统演进中必须保留的架构特性（耦合阈值、循环依赖限制、性能 SLO、安全合规规则），从 ADR、架构原则文档和非功能性需求中提取",
      "将每个特性实现为可执行的适应度函数：耦合指标作为 JDepend/ArchUnit 规则、性能 SLO 作为 CI 中的负载测试断言、安全态势作为自动化漏洞扫描阈值、数据驻留作为策略即代码规则",
      "按执行频率对适应度函数分类：原子型（每次提交在 CI 流水线中运行）、触发型（按计划或部署事件运行）和持续型（作为生产合成监控运行）",
      "将适应度函数结果作为硬性门禁集成到部署流水线中——适应度函数失败会阻止晋升到下一个环境，将架构违规视为构建失败",
      "在每次架构审查时审视并演进适应度函数阈值：退役过时函数、随系统成熟收紧阈值，以及在识别到新架构风险时添加新函数"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Define",
      "Automate",
      "Measure",
      "Evolve"
    ],
    "viz_labels_zh": [
      "定义函数",
      "自动化",
      "度量",
      "架构演化"
    ],
    "related": [
      "adr-y-statements",
      "living-documentation",
      "strangler-fig-pattern"
    ],
    "tags": [
      "evolutionary-architecture",
      "fitness-functions",
      "architecture-governance",
      "ci-cd",
      "architectural-testing"
    ],
    "origin_author": "Neal Ford",
    "origin_source": "Ford, N., Parsons, R. & Kua, P. (2017). \"Building Evolutionary Architectures\". O'Reilly Media; ThoughtWorks Technology Radar (fitness functions entry)",
    "origin_source_zh": "Ford, N.、Parsons, R. 与 Kua, P.（2017）。《演进式架构》，O'Reilly；ThoughtWorks 技术雷达（适应度函数条目）",
    "complexity": "advanced",
    "when_to_use": [
      "When architectural drift is a recurring problem — teams unknowingly introduce circular dependencies, performance regressions, or coupling violations during feature development",
      "When architecture principles exist in documents but are not enforced — fitness functions translate human-readable principles into machine-checkable rules",
      "When migrating a monolith to microservices and you need guardrails to prevent the new architecture from regressing toward the same coupling patterns",
      "When operating in regulated industries where compliance rules (data residency, encryption at rest, audit logging) must be continuously verified rather than periodically audited"
    ],
    "when_to_use_zh": [
      "当架构漂移是反复出现的问题时——团队在功能开发中无意引入循环依赖、性能退化或耦合违规",
      "当架构原则存在于文档中但未被执行时——适应度函数将人类可读的原则转化为机器可检查的规则",
      "当将单体迁移到微服务时，需要护栏防止新架构退回到相同的耦合模式",
      "当在受监管行业运营，合规规则（数据驻留、静态加密、审计日志）必须持续验证而非定期审计时"
    ],
    "core_concepts": [
      "Fitness Function: An objective, executable function that evaluates a specific architectural characteristic — analogous to a test that targets architecture rather than behavior, running automatically in the CI/CD pipeline",
      "Architectural Characteristics: The '-ilities' (maintainability, scalability, security, performance, deployability) that define the system's structural quality and must be preserved as the codebase evolves",
      "Incremental Change with Feedback: Fitness functions enable teams to make small, frequent changes to the architecture while receiving immediate automated feedback when those changes violate architectural principles",
      "Guided Evolution: The set of active fitness functions defines the 'fitness landscape' — the boundary within which the architecture is free to evolve while still meeting its non-functional requirements",
      "Cyclic Dependency Detection: One of the most common and valuable fitness functions is automated detection of package/module cyclic dependencies (using ArchUnit, Dependency Cruiser, or NDepend), which are a leading indicator of architectural decay"
    ],
    "core_concepts_zh": [
      "适应度函数：评估特定架构特性的客观可执行函数——类似于针对架构而非行为的测试，在 CI/CD 流水线中自动运行",
      "架构特性：定义系统结构质量且必须在代码库演进中保留的各种「-ility」（可维护性、可扩展性、安全性、性能、可部署性）",
      "带反馈的增量变更：适应度函数使团队能对架构进行小而频繁的变更，同时在违反架构原则时立即获得自动化反馈",
      "引导式演进：活跃适应度函数的集合定义了「适应度景观」——架构在仍满足非功能性需求的前提下可自由演进的边界",
      "循环依赖检测：最常见且最有价值的适应度函数之一是自动检测包/模块循环依赖（使用 ArchUnit、Dependency Cruiser 或 NDepend），这是架构衰退的领先指标"
    ],
    "timeline": [
      [
        "2009",
        "Michael Feathers coins the term 'architectural fitness function' in discussions on evolutionary design at software conferences"
      ],
      [
        "2017",
        "Neal Ford, Rebecca Parsons, and Patrick Kua publish 'Building Evolutionary Architectures', formalizing fitness functions as the primary mechanism for governed architecture evolution"
      ],
      [
        "2019",
        "ArchUnit (Java), NetArchTest (.NET), and Dependency Cruiser (JS) mature into production-grade architectural testing libraries, making fitness functions accessible to mainstream teams"
      ],
      [
        "2023",
        "AI-assisted architecture analysis tools begin generating fitness function suggestions from codebase scans, lowering the barrier to adoption"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "Michael Feathers 在软件会议的演进式设计讨论中创造了「架构适应度函数」这一术语"
      ],
      [
        "2017",
        "Neal Ford、Rebecca Parsons 和 Patrick Kua 出版《演进式架构》，将适应度函数正式确立为受治理架构演进的主要机制"
      ],
      [
        "2019",
        "ArchUnit（Java）、NetArchTest（.NET）和 Dependency Cruiser（JS）发展为生产级架构测试库，使适应度函数可为主流团队所用"
      ],
      [
        "2023",
        "AI 辅助的架构分析工具开始从代码库扫描中生成适应度函数建议，降低了采用门槛"
      ]
    ],
    "dos": [
      "Do start with the highest-risk architectural characteristics first — coupling thresholds and dependency direction are quick wins that immediately surface structural decay",
      "Do treat fitness function failures as build failures with the same urgency as failing unit tests — an architectural violation that is not blocked becomes a permanent regression",
      "Do version-control fitness function rules alongside the production code so that threshold changes are reviewed and auditable in pull requests",
      "Do run atomic fitness functions in the CI pipeline (seconds to minutes) and reserve heavier functions (load tests, full security scans) for scheduled or pre-release gates to avoid developer workflow friction"
    ],
    "dos_zh": [
      "从最高风险的架构特性开始——耦合阈值和依赖方向是能立即显现结构衰退的快速收益",
      "像对待单元测试失败一样紧急处理适应度函数失败——未被阻止的架构违规将成为永久性退化",
      "将适应度函数规则与生产代码一起版本控制，使阈值变更在拉取请求中可审查和可追溯",
      "在 CI 流水线中运行原子适应度函数（几秒到几分钟），将较重的函数（负载测试、完整安全扫描）保留给计划任务或发布前门禁，避免影响开发者工作流"
    ],
    "donts": [
      "Don't create fitness functions for trivial or cosmetic concerns (code style, comment density) — fitness functions are for architectural invariants, not style guides, which belong in linters",
      "Don't set thresholds so strict they fail on day one — start permissive to establish a baseline and tighten incrementally; a fitness function that always fails is ignored",
      "Don't silently suppress fitness function failures in the pipeline without a documented exception process — unchecked suppressions accumulate into untracked architectural debt",
      "Don't make all fitness functions atomic/blocking — over-gating the CI pipeline with slow checks destroys developer productivity; classify and run at the appropriate cadence"
    ],
    "donts_zh": [
      "不要为琐碎或表面问题（代码风格、注释密度）创建适应度函数——适应度函数用于架构不变量，而非应属于代码检查工具的风格指南",
      "不要设置一开始就会失败的过于严格的阈值——从宽松开始建立基线，然后逐步收紧；总是失败的适应度函数会被忽视",
      "不要在没有文档化例外流程的情况下悄然在流水线中抑制适应度函数失败——未受检查的抑制会积累成未跟踪的架构债务",
      "不要让所有适应度函数都是原子/阻塞的——用缓慢检查过度拦截 CI 流水线会破坏开发者生产力；按适当频率分类运行"
    ],
    "case_study_company": "ThoughtWorks",
    "case_study": "ThoughtWorks applied fitness function-driven development on a large-scale financial services platform migration from a 10-year-old monolith to a microservices architecture. The team implemented 47 fitness functions covering coupling thresholds (maximum 5 inbound dependencies per service), performance SLOs (p95 < 200ms for payment APIs), data residency (no EU personal data in non-EU regions), and security posture (no high-severity CVEs in container images). Fitness functions caught 12 architectural violations during the 18-month migration that would have required costly rollbacks if discovered in production, including a circular dependency chain between 5 services that emerged when two squads independently refactored their service boundaries in the same sprint.",
    "case_study_zh": "ThoughtWorks 在将一个有 10 年历史的单体应用迁移到微服务架构的大型金融服务平台项目中应用了适应度函数驱动开发。团队实施了 47 个适应度函数，涵盖耦合阈值（每个服务最多 5 个入站依赖）、性能 SLO（支付 API 的 p95 < 200ms）、数据驻留（欧盟个人数据不得存储于非欧盟地区）和安全态势（容器镜像中无高严重性 CVE）。在 18 个月的迁移过程中，适应度函数捕获了 12 个架构违规，如果在生产中才被发现将需要代价高昂的回滚，其中包括当两个小组在同一个冲刺中独立重构各自服务边界时出现的 5 个服务之间的循环依赖链。",
    "when_not_to_use": [
      "Early-stage startups or prototypes where the architecture is intentionally fluid and premature governance would constrain experimentation needed to find product-market fit",
      "Small codebases with a single team where direct architectural communication and lightweight code review substitute effectively for automated governance",
      "Systems at the end of their lifecycle where the investment in fitness function infrastructure will not be amortized before the system is replaced",
      "Teams without CI/CD pipelines — fitness functions require automated execution infrastructure; manual checks are not a viable substitute"
    ],
    "when_not_to_use_zh": [
      "早期初创公司或原型阶段，架构有意保持流动性，过早的治理会限制寻找产品市场契合所需的实验",
      "拥有单一团队的小型代码库，直接架构沟通和轻量级代码审查能有效替代自动化治理",
      "处于生命周期末期的系统，适应度函数基础设施的投资在系统被替换之前无法摊销",
      "没有 CI/CD 流水线的团队——适应度函数需要自动化执行基础设施；手动检查不是可行的替代方案"
    ],
    "adopters": [
      "ThoughtWorks",
      "Netflix",
      "Spotify",
      "Zalando",
      "Wealthfront"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "reliability",
      "testability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Ford, N., Parsons, R. & Kua, P. (2017). \"Building Evolutionary Architectures\". O'Reilly Media.",
    "secondary_sources": [
      "ThoughtWorks (2020). \"Fitness functions as architecture tests\". ThoughtWorks Insights.",
      "Kua, P. (2019). \"Evolutionary Architecture\". InfoQ conference presentation."
    ],
    "typed_relations": [
      {
        "slug": "adr-y-statements",
        "type": "complement"
      },
      {
        "slug": "living-documentation",
        "type": "complement"
      },
      {
        "slug": "strangler-fig-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 246,
    "name": "Expansion/Contraction Pattern",
    "name_zh": "扩展/收缩模式",
    "slug": "expansion-contraction-pattern",
    "category": "evolution",
    "desc": "Safe API and schema migration technique that introduces new capabilities before removing old ones, allowing all clients to migrate without downtime",
    "desc_zh": "安全的 API 和模式迁移技术，在移除旧能力之前引入新能力，让所有客户端无需停机即可完成迁移",
    "steps": [
      "Expansion phase: add the new API shape, field, or behavior alongside the existing one without removing anything; both old and new forms are simultaneously supported so no client is broken by the deployment",
      "Migrate consumers: update all known consumers (internal services, mobile clients, partner integrations) to use the new form, using telemetry to track the percentage of traffic still using the old form",
      "Validate migration completeness: confirm through API usage metrics and consumer-driven contract tests that traffic on the old form has dropped to zero or acceptable residual levels",
      "Contraction phase: remove the old form once all consumers have migrated; publish a deprecation notice with the planned removal date before contraction, providing the agreed sunset window",
      "Monitor after contraction: watch for unexpected 400/404 errors from any consumers that were missed in the migration audit, and maintain a rollback plan for the first 72 hours post-contraction"
    ],
    "steps_zh": [
      "扩展阶段：在保留现有 API 形态、字段或行为的同时添加新的形态，不删除任何内容；新旧两种形式同时支持，部署不会破坏任何客户端",
      "迁移消费者：更新所有已知消费者（内部服务、移动客户端、合作伙伴集成）以使用新形式，使用遥测跟踪仍在使用旧形式的流量占比",
      "验证迁移完整性：通过 API 使用指标和消费者驱动的契约测试确认旧形式的流量已降至零或可接受的残余水平",
      "收缩阶段：一旦所有消费者完成迁移，移除旧形式；在收缩之前发布带计划移除日期的废弃通知，提供约定的下线窗口",
      "收缩后监控：观察来自迁移审计中遗漏的消费者产生的意外 400/404 错误，并在收缩后最初 72 小时内维护回滚计划"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Expand",
      "Dual Support",
      "Migrate",
      "Contract"
    ],
    "viz_labels_zh": [
      "扩展阶段",
      "新旧并存",
      "迁移阶段",
      "收缩阶段"
    ],
    "related": [
      "strangler-fig-pattern",
      "api-versioning-strategies",
      "branch-by-abstraction"
    ],
    "tags": [
      "migration",
      "backward-compatibility",
      "api-evolution",
      "zero-downtime",
      "deprecation"
    ],
    "origin_author": "Sam Newman",
    "origin_source": "Newman, S. (2019). \"Monolith to Microservices\". O'Reilly Media, Ch. 3 (Splitting Apart the Database); Parallel Change pattern (Danilo Sato, martinfowler.com)",
    "origin_source_zh": "Newman, S.（2019）。《从单体到微服务》，O'Reilly，第 3 章（拆分数据库）；并行变更模式（Danilo Sato，martinfowler.com）",
    "complexity": "intermediate",
    "when_to_use": [
      "When renaming a database column, API field, or service endpoint that is consumed by multiple services or clients that cannot be updated atomically",
      "When changing the type or structure of a shared API contract (splitting a field into two, merging two endpoints) where a flag-day cutover would require coordinated deployment of multiple teams",
      "When migrating a database schema in a system that must remain online during the migration, requiring the application to read/write both old and new column formats simultaneously",
      "When a public API field needs to be removed after deprecation and you need a safe, observable migration path that does not require client downtime"
    ],
    "when_to_use_zh": [
      "当重命名被多个服务或客户端消费的数据库列、API 字段或服务端点，且这些客户端无法原子性更新时",
      "当更改共享 API 契约的类型或结构（将字段拆分为两个、合并两个端点），标志日切换需要多个团队协调部署时",
      "当在必须保持在线的系统中迁移数据库模式，要求应用同时读写新旧两种列格式时",
      "当公共 API 字段在废弃后需要被移除，你需要一个不要求客户端停机的安全、可观察的迁移路径时"
    ],
    "core_concepts": [
      "Parallel Change (Expand): The first phase adds the new form alongside the old without removing anything, creating a temporary period where both forms exist and are valid — a structural backward-compatibility guarantee",
      "Consumer Migration Window: The period between expansion and contraction during which all consumers are expected to migrate; this window's length is determined by consumer SLAs, communication lead time, and organizational change velocity",
      "Usage-Based Contraction Gating: Contraction should be triggered by data (zero or near-zero usage of the old form as confirmed by metrics) rather than by calendar date alone, reducing the risk of premature removal",
      "Database Dual-Write/Dual-Read: When expanding a database schema (splitting a column), the application must write to both old and new columns and read from the new column with fallback to old, until the old column is confirmed unused",
      "Contract Tests as Migration Proof: Consumer-driven contract tests (Pact) serve as automated proof that consumers have migrated to the new form; a green contract test suite is a precondition for the contraction phase"
    ],
    "core_concepts_zh": [
      "并行变更（扩展）：第一阶段在不删除旧形式的情况下添加新形式，创造一个新旧两种形式都存在且有效的临时期——一种结构性向后兼容性保证",
      "消费者迁移窗口：扩展和收缩之间的期间，期间所有消费者应完成迁移；该窗口长度由消费者 SLA、沟通提前期和组织变更速度决定",
      "基于使用量的收缩门禁：收缩应由数据触发（指标确认旧形式使用量为零或接近零），而非仅依据日历日期，降低过早移除的风险",
      "数据库双写/双读：当扩展数据库模式（拆分列）时，应用必须同时写入新旧两列，并从新列读取（回退到旧列），直到确认旧列不再被使用",
      "契约测试作为迁移证明：消费者驱动的契约测试（Pact）作为消费者已迁移到新形式的自动化证明；契约测试套件通过是收缩阶段的前提条件"
    ],
    "timeline": [
      [
        "2010",
        "Joshua Kerievsky presents 'Parallel Change' in his talk The Limited Red Society as a refactoring technique for safely changing API signatures"
      ],
      [
        "2014",
        "Danilo Sato formalizes the Parallel Change pattern with expand/migrate/contract phases as a named structural refactoring pattern"
      ],
      [
        "2019",
        "Sam Newman applies the expansion/contraction technique explicitly to database schema migrations in 'Monolith to Microservices', popularizing it for zero-downtime schema evolution"
      ],
      [
        "2022",
        "Consumer-driven contract testing tools (Pact, Spring Cloud Contract) integrate migration tracking, providing automated migration completeness verification before contraction"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "Joshua Kerievsky 在其演讲《The Limited Red Society》中将「并行变更」介绍为安全更改 API 签名的重构技术"
      ],
      [
        "2014",
        "Danilo Sato 将并行变更模式正式确立为具有扩展/迁移/收缩阶段的命名结构重构模式"
      ],
      [
        "2019",
        "Sam Newman 在《从单体到微服务》中将扩展/收缩技术明确应用于数据库模式迁移，推广了零停机模式演进"
      ],
      [
        "2022",
        "消费者驱动的契约测试工具（Pact、Spring Cloud Contract）集成了迁移跟踪，在收缩前提供自动化的迁移完整性验证"
      ]
    ],
    "dos": [
      "Do instrument both old and new API forms with request counters from day one of expansion, so you have data-driven confidence that migration is complete before contracting",
      "Do communicate the expansion and planned contraction date to all known consumers at the start of the expansion phase, giving them the full migration window to plan and execute updates",
      "Do write a consumer-driven contract test for the new form before contraction — a passing contract test is the strongest automated proof that a consumer has migrated",
      "Do maintain the dual-write/dual-read pattern in database migrations for at least one full deployment cycle after the last read from the old column, allowing rollback if unexpected consumers appear"
    ],
    "dos_zh": [
      "从扩展第一天起就为新旧两种 API 形式配置请求计数器，以便在收缩前有数据驱动的信心确认迁移已完成",
      "在扩展阶段开始时将扩展和计划收缩日期告知所有已知消费者，给予他们完整的迁移窗口来规划和执行更新",
      "在收缩前为新形式编写消费者驱动的契约测试——通过的契约测试是消费者已迁移的最强自动化证明",
      "在数据库迁移中，在最后一次读取旧列后至少保持一个完整部署周期的双写/双读模式，以便在出现意外消费者时可以回滚"
    ],
    "donts": [
      "Don't skip the instrumentation step and contract the old form based on assumed migration completion — unknown consumers using the old form will generate errors that are difficult to diagnose post-contraction",
      "Don't make the expansion and contraction happen in the same deployment — the consumer migration window must span at least one release cycle so consumers have time to discover and adopt the new form",
      "Don't remove the old form while any production traffic is still using it, even if it is less than 1% — unknown consumers may be internal monitoring scripts, scheduled jobs, or partner integrations with infrequent call patterns",
      "Don't use expansion/contraction for purely additive changes (adding a new optional field) — it is only necessary for changes where the old form will eventually be removed or is actively harmful"
    ],
    "donts_zh": [
      "不要跳过监控步骤，基于假设的迁移完成来收缩旧形式——使用旧形式的未知消费者将产生难以在收缩后诊断的错误",
      "不要让扩展和收缩发生在同一次部署中——消费者迁移窗口必须跨越至少一个发布周期，让消费者有时间发现并采用新形式",
      "不要在任何生产流量仍在使用旧形式时移除它，即使不足 1%——未知消费者可能是内部监控脚本、定时任务或调用模式不频繁的合作伙伴集成",
      "不要对纯新增变更（添加新的可选字段）使用扩展/收缩——它仅在旧形式最终将被移除或主动有害的变更中才有必要"
    ],
    "case_study_company": "Booking.com",
    "case_study": "Booking.com applied the expansion/contraction pattern to migrate their property search API from a legacy flat response model to a structured hierarchical model across 200+ internal consumer services. The expansion phase ran for 8 weeks with both response formats served simultaneously, instrumented with per-consumer call counters. Migration was tracked on a team dashboard showing each consumer's adoption percentage. Contraction was blocked until all 200+ consumers showed 100% usage of the new format, confirmed by automated contract tests. The migration completed without a single client-visible error and without requiring any coordinated deployment freeze, enabling 15 separate engineering teams to migrate on their own schedule.",
    "case_study_zh": "Booking.com 应用扩展/收缩模式，将其房源搜索 API 从旧版扁平响应模型迁移到结构化分层模型，涉及 200 多个内部消费者服务。扩展阶段运行了 8 周，两种响应格式同时提供服务，并配置了按消费者的调用计数器。迁移通过显示每个消费者采用百分比的团队仪表板进行跟踪。收缩被阻止，直到所有 200 多个消费者显示 100% 使用新格式（通过自动化契约测试确认）。迁移完成时没有任何客户端可见的错误，也不需要任何协调部署冻结，让 15 个独立工程团队按照自己的时间表完成迁移。",
    "when_not_to_use": [
      "Security-critical changes (removing a vulnerable endpoint, fixing an authentication bypass) that must be deployed immediately without a migration window",
      "Changes where the old and new forms cannot coexist due to resource constraints (storage cost of maintaining two column formats is prohibitive, or the old behavior is actively insecure)",
      "New feature additions with no existing consumers — expansion/contraction is a migration pattern for changing existing contracts, not for introducing new capabilities",
      "Environments where deployment velocity is so high (multiple deploys per hour) that migration windows can be extremely short and consumer migration happens naturally within hours"
    ],
    "when_not_to_use_zh": [
      "必须立即部署而不能有迁移窗口的安全关键变更（移除有漏洞的端点、修复认证绕过）",
      "由于资源限制新旧两种形式无法共存的变更（维护两种列格式的存储成本过高，或旧行为主动不安全）",
      "没有现有消费者的新功能添加——扩展/收缩是用于更改现有契约的迁移模式，而非引入新能力",
      "部署速度极高（每小时多次部署）以至于迁移窗口可以极短、消费者迁移在数小时内自然发生的环境"
    ],
    "adopters": [
      "Booking.com",
      "Stripe",
      "Netflix",
      "Etsy",
      "SoundCloud"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "reliability",
      "portability"
    ],
    "maturity_ring": "established",
    "primary_source": "Newman, S. (2019). \"Monolith to Microservices\". O'Reilly Media.",
    "secondary_sources": [
      "Sato, D. (2014). \"Parallel Change\". martinfowler.com.",
      "Fowler, M. (2012). \"Tolerant Reader\". martinfowler.com.",
      "Humble, J. & Farley, D. (2010). \"Continuous Delivery\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "strangler-fig-pattern",
        "type": "complement"
      },
      {
        "slug": "api-versioning-strategies",
        "type": "related"
      },
      {
        "slug": "branch-by-abstraction",
        "type": "related"
      }
    ]
  },
  {
    "id": 247,
    "name": "Architecture Decision Records (Y-Statements)",
    "name_zh": "架构决策记录（Y 型陈述）",
    "slug": "adr-y-statements",
    "category": "evolution",
    "desc": "Structured decision documentation using the Y-statement format to capture context, decision, and consequences of significant architectural choices",
    "desc_zh": "使用 Y 型陈述格式记录背景、决策和后果，结构化记录重大架构决策",
    "steps": [
      "Identify decisions that warrant recording: those that are architecturally significant (affect multiple components or teams), hard to reverse, driven by constraints that future developers may not know, or required for compliance/audit purposes",
      "Author the ADR using the Y-statement format: 'In the context of [situation], facing [concern], we decided [option], to achieve [quality], accepting [downside]' — filling each clause forces the author to make implicit reasoning explicit",
      "Document alternatives considered and their trade-offs in the ADR body, explaining why each alternative was rejected; this is as valuable as the chosen option for future readers trying to understand the decision space",
      "Assign status (Proposed, Accepted, Deprecated, Superseded) and link superseding ADRs — an outdated ADR whose status is never updated creates a false impression that the decision is still in force",
      "Store ADRs as numbered Markdown files in the repository (e.g., docs/adr/0042-use-postgres-for-session-storage.md), adjacent to the code they describe, so they appear in git blame and code review contexts"
    ],
    "steps_zh": [
      "识别值得记录的决策：具有架构意义的（影响多个组件或团队）、难以逆转的、由未来开发者可能不知道的约束驱动的，或合规/审计目的所需的",
      "使用 Y 型陈述格式撰写 ADR：「在 [情境] 的背景下，面对 [关切]，我们决定 [选项]，以实现 [质量]，接受 [缺点]」——填写每个子句迫使作者将隐性推理显式化",
      "在 ADR 正文中记录考虑过的备选方案及其权衡，解释每个备选方案被拒绝的原因；这对于试图理解决策空间的未来读者来说与所选选项同样有价值",
      "分配状态（已提议、已接受、已废弃、已取代）并链接取代 ADR——已过时却从未更新状态的 ADR 会给人该决策仍然有效的错误印象",
      "将 ADR 作为带编号的 Markdown 文件存储在仓库中（如 docs/adr/0042-use-postgres-for-session-storage.md），与其描述的代码相邻，使其出现在 git blame 和代码审查上下文中"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Context",
      "Decision",
      "Consequences",
      "Status"
    ],
    "viz_labels_zh": [
      "背景",
      "决策",
      "后果",
      "状态"
    ],
    "related": [
      "fitness-function-driven-development",
      "living-documentation",
      "domain-driven-design"
    ],
    "tags": [
      "architecture",
      "decision-records",
      "documentation",
      "y-statements",
      "governance",
      "knowledge-management"
    ],
    "origin_author": "Olaf Zimmermann",
    "origin_source": "Zimmermann, O. (2011). \"Architectural Decision Capturing in Agile Projects\". IEEE Software; Nygard, M. (2011). \"Documenting Architecture Decisions\". thinkrelevance.com",
    "origin_source_zh": "Zimmermann, O.（2011）。「敏捷项目中的架构决策捕获」，IEEE Software；Nygard, M.（2011）。「记录架构决策」，thinkrelevance.com",
    "complexity": "beginner",
    "when_to_use": [
      "When onboarding new team members who need to understand why the current architecture looks the way it does, without having to reconstruct decisions from meeting notes or tribal knowledge",
      "When a significant architectural decision is being made that will constrain future options — the ADR authoring process forces structured reasoning before commitment",
      "When working in regulated industries (finance, healthcare) where architecture decisions must be auditable and traceable to requirements or compliance obligations",
      "When multiple teams share a platform and need a lightweight, asynchronous process for proposing and reviewing architectural decisions without requiring synchronous meetings"
    ],
    "when_to_use_zh": [
      "当新团队成员需要了解当前架构为何如此，而无需从会议记录或部落知识中重建决策时",
      "当正在做出将约束未来选项的重大架构决策时——ADR 撰写过程在承诺前强制进行结构化推理",
      "当在受监管行业（金融、医疗）工作，架构决策必须可审计并可追溯到需求或合规义务时",
      "当多个团队共享平台并需要一个轻量级、异步的流程来提议和审查架构决策，而不需要同步会议时"
    ],
    "core_concepts": [
      "Y-Statement Format: A sentence template that structures an ADR as: 'In the context of [use case/user story], facing [concern/force], we decided [option], to achieve [quality/goal], accepting [downside/risk]' — each clause is mandatory, preventing incomplete decision records",
      "Decision Status Lifecycle: ADRs move through Proposed (under discussion), Accepted (in force), Deprecated (still valid but no longer recommended), and Superseded (replaced by a newer ADR with a link) — the lifecycle makes the current validity of each decision explicit",
      "Alternatives Considered: The most valuable section for future readers; documenting rejected alternatives and their trade-offs prevents teams from re-litigating the same decision space years later when context has been forgotten",
      "Immutability and Supersedence: The content of an ADR is never edited after acceptance — if a decision changes, only the original ADR's status is updated to Superseded, and a new ADR documents the new decision and why the old one no longer applies",
      "Colocation with Code: Storing ADRs in the repository (not in Confluence or a separate wiki) ensures they survive team transitions, appear in git history, and are discoverable when developers read the code they describe"
    ],
    "core_concepts_zh": [
      "Y 型陈述格式：将 ADR 结构化为一句话模板：「在 [用例/用户故事] 的背景下，面对 [关切/力量]，我们决定 [选项]，以实现 [质量/目标]，接受 [缺点/风险]」——每个子句都是必填的，防止不完整的决策记录",
      "决策状态生命周期：ADR 经历已提议（讨论中）、已接受（有效中）、已废弃（仍有效但不再推荐）和已取代（被带有链接的更新 ADR 替换）——生命周期使每个决策的当前有效性显式可见",
      "考虑过的备选方案：对未来读者最有价值的部分；记录被拒绝的备选方案及其权衡可防止团队在多年后忘记背景时重新争论相同的决策空间",
      "不可变性与取代：ADR 的内容在接受后从不编辑——如果决策改变，仅将原始 ADR 的状态更新为已取代，并由新 ADR 记录新决策以及旧决策不再适用的原因",
      "与代码共存：将 ADR 存储在仓库中（而非 Confluence 或独立 Wiki）确保它们能在团队交接中保存、出现在 git 历史中，并在开发者阅读其描述的代码时可被发现"
    ],
    "timeline": [
      [
        "2011",
        "Michael Nygard publishes 'Documenting Architecture Decisions' introducing the minimal ADR format with Context, Decision, and Consequences sections"
      ],
      [
        "2011",
        "Olaf Zimmermann publishes research on Y-statements in IEEE Software, providing a structured sentence format for capturing architectural decisions"
      ],
      [
        "2016",
        "Nat Pryce releases adr-tools, a command-line tool for managing ADR files, making the practice accessible to development teams without process overhead"
      ],
      [
        "2020",
        "ADRs adopted by major open source projects and cloud providers (AWS, Google Cloud) as standard practice; GitHub and GitLab add ADR templates to project scaffolding"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Michael Nygard 发表「记录架构决策」，引入带有背景、决策和后果部分的最小 ADR 格式"
      ],
      [
        "2011",
        "Olaf Zimmermann 在 IEEE Software 上发表 Y 型陈述研究，为捕获架构决策提供结构化的句子格式"
      ],
      [
        "2016",
        "Nat Pryce 发布 adr-tools，一个用于管理 ADR 文件的命令行工具，让开发团队无需流程开销即可采用该实践"
      ],
      [
        "2020",
        "ADR 被主要开源项目和云服务商（AWS、Google Cloud）采用为标准实践；GitHub 和 GitLab 将 ADR 模板添加到项目脚手架中"
      ]
    ],
    "dos": [
      "Do write ADRs at decision time, not retrospectively — the reasoning and context are freshest when the decision is being made; retrospective ADRs often omit the real reasons for the choice",
      "Do keep ADRs short (one to two pages) — an ADR that requires a 10-page document is really a design document, not a decision record; the Y-statement format enforces brevity by design",
      "Do link ADRs bidirectionally: from the ADR to the code/PR it informed, and from code comments or PR descriptions back to the ADR that explains the reasoning",
      "Do treat an ADR as an asynchronous RFC (Request for Comment) for significant decisions — circulate the draft in Proposed status, collect comments, update, and then move to Accepted"
    ],
    "dos_zh": [
      "在决策时撰写 ADR，而非事后追记——推理和背景在做出决策时最为清晰；事后追记的 ADR 往往遗漏真正的决策原因",
      "保持 ADR 简短（一到两页）——需要 10 页文档的 ADR 实际上是设计文档，而非决策记录；Y 型陈述格式通过设计强制简洁",
      "双向链接 ADR：从 ADR 链接到其所依据的代码/PR，从代码注释或 PR 描述链接回解释推理的 ADR",
      "对重大决策将 ADR 视为异步 RFC（征求意见）——以已提议状态流转草稿、收集评论、更新，然后转为已接受"
    ],
    "donts": [
      "Don't write ADRs for implementation details or tactical coding choices — reserve the practice for architecturally significant decisions that constrain the system's structure or quality attributes",
      "Don't edit an accepted ADR to reflect a changed decision — immutability is fundamental; create a new ADR that supersedes the old one, preserving the full decision history",
      "Don't store ADRs only in an external wiki (Confluence, Notion) — wikis get abandoned, renamed, or lost during company transitions; the repository is the only durable, co-located home",
      "Don't allow an ADR backlog to grow without review — unreviewed Proposed ADRs create confusion about which decisions are actually in force; resolve or close stale proposals within a defined SLA"
    ],
    "donts_zh": [
      "不要为实现细节或战术性编码选择撰写 ADR——将该实践保留给约束系统结构或质量属性的具有架构意义的决策",
      "不要编辑已接受的 ADR 以反映变更的决策——不可变性是根本原则；创建取代旧 ADR 的新 ADR，保留完整的决策历史",
      "不要仅将 ADR 存储在外部 Wiki（Confluence、Notion）中——Wiki 在公司变迁中会被遗弃、重命名或丢失；仓库是唯一持久、共存的归宿",
      "不要让 ADR 待办积压而不审查——未审查的已提议 ADR 会造成哪些决策实际有效的混乱；在定义的 SLA 内解决或关闭陈旧提案"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub adopted ADRs as a core architectural practice after experiencing significant knowledge loss during rapid headcount growth from 300 to 1,800 engineers between 2013 and 2016. The architectural decisions that had enabled GitHub's early scale — Rails monolith structure, MySQL sharding strategy, Resque-based background jobs — were undocumented, causing new teams to repeatedly question and sometimes inadvertently undermine them. After introducing ADRs with the Y-statement format for all significant architectural decisions in 2017, GitHub's platform team reported a 40% reduction in architecture-related discussion overhead in pull requests, as reviewers could link to ADRs rather than re-explaining context. The ADR repository now contains over 300 records spanning GitHub's full architectural evolution.",
    "case_study_zh": "GitHub 在 2013 至 2016 年从 300 名工程师快速增长到 1800 名期间经历了严重的知识流失，随后将 ADR 作为核心架构实践采用。曾使 GitHub 早期实现规模化的架构决策——Rails 单体结构、MySQL 分片策略、基于 Resque 的后台任务——都没有文档记录，导致新团队反复质疑，有时甚至无意中破坏这些决策。2017 年为所有重大架构决策引入 Y 型陈述格式的 ADR 后，GitHub 平台团队报告拉取请求中架构相关讨论开销减少了 40%，因为审查者可以链接到 ADR 而无需重新解释背景。ADR 仓库现已包含超过 300 条记录，涵盖 GitHub 的完整架构演进历程。",
    "when_not_to_use": [
      "Decisions that are purely tactical, reversible within a sprint, and affect only one team's internal implementation — the overhead of an ADR is not justified for low-stakes, easily-changed choices",
      "Highly experimental or prototype phases where architectural decisions are expected to change weekly — premature decision documentation for volatile designs creates noise rather than clarity",
      "Very small teams (2-3 engineers) where the entire team participates in every architecture discussion — institutional knowledge is not at risk of being lost and the overhead may exceed the value",
      "Organizations without a code review culture or repository-based documentation practice — ADRs stored in isolation from the team's workflow will not be discovered or maintained"
    ],
    "when_not_to_use_zh": [
      "纯战术性的、在一个冲刺内可逆的、仅影响一个团队内部实现的决策——ADR 的开销对于低风险、易于更改的选择不合理",
      "高度实验性或原型阶段，架构决策预计每周变化——对易变设计进行过早的决策文档化会产生噪音而非清晰",
      "非常小的团队（2-3 名工程师），整个团队参与每次架构讨论——机构知识不存在丢失风险，开销可能超过价值",
      "没有代码审查文化或基于仓库的文档实践的组织——与团队工作流隔离存储的 ADR 不会被发现或维护"
    ],
    "adopters": [
      "GitHub",
      "Spotify",
      "Zalando",
      "Shopify",
      "Thoughtworks"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "observability"
    ],
    "maturity_ring": "established",
    "primary_source": "Nygard, M. (2011). \"Documenting Architecture Decisions\". thinkrelevance.com.",
    "secondary_sources": [
      "Zimmermann, O. (2011). \"Architectural Decision Capturing in Agile Projects\". IEEE Software 28(6).",
      "Keeling, M. (2017). \"Design It!\". Pragmatic Programmers.",
      "Hohpe, G. (2020). \"The Software Architect Elevator\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "fitness-function-driven-development",
        "type": "complement"
      },
      {
        "slug": "living-documentation",
        "type": "complement"
      },
      {
        "slug": "domain-driven-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 314,
    "name": "Monolith Decomposition Patterns",
    "name_zh": "单体分解模式",
    "slug": "monolith-decomposition-patterns",
    "category": "evolution",
    "desc": "A catalog of proven strategies for systematically extracting microservices from a monolithic codebase while maintaining system stability and team velocity throughout the migration",
    "desc_zh": "从单体代码库系统化提取微服务的成熟策略目录，在整个迁移过程中保持系统稳定性和团队速度",
    "steps": [
      "Map the decomposition surface: analyze the monolith's module dependency graph, identify seam boundaries (packages with low coupling to the rest), and rank extraction candidates by business value, team ownership clarity, and change frequency",
      "Select a decomposition pattern per seam: apply Strangler Fig for greenfield functionality, Branch by Abstraction for in-place refactoring of hot paths, or Parallel Run for high-risk data migrations — match the pattern to the risk and reversibility needs",
      "Establish data ownership boundaries before code boundaries: identify which tables belong to each candidate service, implement an anti-corruption layer for cross-boundary data access, and set a timeline for migrating shared tables to owned schemas",
      "Extract incrementally with dual-write phases: run extracted services in shadow mode alongside the monolith, compare outputs to validate correctness, then progressively shift traffic using feature flags before retiring the monolith code path",
      "Measure decomposition health: track monolith code coverage removal rate, inter-service coupling index, deployment frequency per extracted service, and P99 latency delta post-extraction to confirm each extraction delivers tangible improvement"
    ],
    "steps_zh": [
      "绘制分解面：分析单体的模块依赖图，识别接缝边界（与其余部分耦合度低的包），并按业务价值、团队所有权清晰度和变更频率对提取候选项排名",
      "为每个接缝选择分解模式：对绿地功能应用绞杀者无花果，对热路径的原地重构应用抽象分支，对高风险数据迁移应用并行运行——根据风险和可逆性需求匹配模式",
      "在代码边界之前建立数据所有权边界：识别哪些表属于每个候选服务，为跨边界数据访问实现防腐层，并设定将共享表迁移到自有模式的时间表",
      "通过双写阶段增量提取：以影子模式运行提取的服务与单体并行，比较输出以验证正确性，然后在停用单体代码路径前使用特性标志逐步转移流量",
      "衡量分解健康状况：追踪单体代码覆盖移除率、服务间耦合指数、每个提取服务的部署频率以及提取后P99延迟增量，以确认每次提取都带来实质性改善"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Identify",
      "Extract",
      "Route",
      "Decommission"
    ],
    "viz_labels_zh": [
      "识别边界",
      "提取服务",
      "路由切换",
      "下线模块"
    ],
    "related": [
      "strangler-fig-pattern",
      "branch-by-abstraction",
      "parallel-run",
      "database-migration-patterns",
      "expansion-contraction-pattern"
    ],
    "tags": [
      "migration",
      "microservices",
      "decomposition",
      "monolith",
      "evolution"
    ],
    "origin_author": "Sam Newman (Building Microservices); Martin Fowler (StranglerFig, BranchByAbstraction)",
    "origin_year": 2015,
    "origin_source": "Newman, S. (2019). \"Monolith to Microservices\". O'Reilly Media.",
    "origin_source_zh": "Newman, S.（2019）。《从单体到微服务》。O'Reilly Media。",
    "complexity": "advanced",
    "abstraction_level": "system",
    "maturity_ring": "established",
    "quality_concerns": [
      "maintainability",
      "scalability",
      "reliability"
    ],
    "adopters": [
      "Amazon",
      "Uber",
      "Twitter",
      "Stack Overflow",
      "Shopify"
    ],
    "when_to_use": [
      "Monoliths where deployment bottlenecks — all teams must coordinate releases — are measurably reducing team autonomy and delivery speed",
      "Specific bounded contexts within the monolith experiencing scaling problems that cannot be addressed without isolated deployment and resource allocation",
      "Organizations growing past 50 engineers where Conway's Law pressure makes a single codebase a social coordination problem as much as a technical one",
      "Acquisitions or platform migrations where the target architecture is services-based and the monolith must be incrementally replaced rather than big-bang rewritten"
    ],
    "when_to_use_zh": [
      "部署瓶颈（所有团队必须协调发布）正在可量化地降低团队自主性和交付速度的单体",
      "单体内特定有界上下文遇到无法在没有隔离部署和资源分配的情况下解决的扩展问题",
      "工程师超过50人的组织，康威定律压力使单一代码库既是技术问题也是社会协调问题",
      "目标架构基于服务的收购或平台迁移，单体必须增量替换而非大爆炸式重写"
    ],
    "core_concepts": [
      "Seam: A natural boundary within the monolith where code can be separated with minimal changes — identified by analyzing import graphs, database table access patterns, and team ownership clusters",
      "Anti-Corruption Layer (ACL): An adapter layer that translates between the monolith's data model and the extracted service's model, preventing the old model from leaking into the new service's domain",
      "Dual-Write / Parallel Run: A transitional phase where both the monolith and the extracted service write to their respective data stores simultaneously, enabling comparison validation before full cutover",
      "Data Decomposition: The process of separating shared database tables into service-owned schemas, typically via the Database-per-Service pattern, which is often harder and riskier than code extraction",
      "Seam Catalogue: A living inventory of identified decomposition candidates, ranked by extraction priority, current coupling metrics, and assigned owning team — the primary planning artifact for a decomposition programme"
    ],
    "core_concepts_zh": [
      "接缝：单体内的自然边界，代码可以在最小变更下分离——通过分析导入图、数据库表访问模式和团队所有权集群来识别",
      "防腐层（ACL）：在单体数据模型和提取服务模型之间转换的适配器层，防止旧模型泄漏到新服务的领域中",
      "双写/并行运行：单体和提取的服务同时向各自数据存储写入的过渡阶段，在完全切换前启用比较验证",
      "数据分解：将共享数据库表分离到服务自有模式的过程，通常通过每服务数据库模式实现，这往往比代码提取更难且风险更高",
      "接缝目录：已识别的分解候选项的持续更新清单，按提取优先级、当前耦合指标和指定的归属团队排名——分解项目的主要规划产物"
    ],
    "timeline": [
      [
        "2004",
        "Martin Fowler coins 'StranglerFig Application' pattern on martinfowler.com, inspired by strangler fig vines"
      ],
      [
        "2014",
        "Sam Newman publishes 'Building Microservices' (O'Reilly), the first comprehensive treatment of decomposition strategies"
      ],
      [
        "2019",
        "Newman publishes 'Monolith to Microservices' consolidating decomposition patterns into a formal catalogue with data migration guidance"
      ],
      [
        "2022",
        "Modular monolith pattern gains traction as teams recognize that seam identification is valuable independent of microservice extraction"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "Martin Fowler在martinfowler.com上创造「绞杀者无花果应用」模式，灵感来自绞杀无花果藤"
      ],
      [
        "2014",
        "Sam Newman出版《构建微服务》（O'Reilly），首次全面处理分解策略"
      ],
      [
        "2019",
        "Newman出版《从单体到微服务》，将分解模式整合为带有数据迁移指导的正式目录"
      ],
      [
        "2022",
        "模块化单体模式获得关注，团队认识到接缝识别独立于微服务提取具有价值"
      ]
    ],
    "dos": [
      "Do decompose data boundaries before or in parallel with code boundaries — a microservice that shares a database with the monolith is a distributed monolith, not a microservice",
      "Do measure the monolith's seam graph before starting — teams that skip this analysis tend to extract services along team lines rather than domain lines, creating chatty inter-service APIs",
      "Do use feature flags to control traffic migration to extracted services so you can roll back instantly without a deployment if correctness problems are discovered",
      "Do define done criteria for each extraction: ownership of the data store, zero cross-database joins, independent deployment pipeline, and team autonomy over technology choices"
    ],
    "dos_zh": [
      "在代码边界之前或与之同步分解数据边界——与单体共享数据库的微服务是分布式单体，而非微服务",
      "在开始前测量单体的接缝图——跳过此分析的团队倾向于沿团队边界而非领域边界提取服务，产生调用过于频繁的服务间API",
      "使用特性标志控制到提取服务的流量迁移，以便在发现正确性问题时可以立即回滚而无需部署",
      "为每次提取定义完成标准：数据存储所有权、零跨数据库联接、独立部署管道以及团队对技术选择的自主权"
    ],
    "donts": [
      "Don't start with a big-bang rewrite — the historical failure rate of full rewrites is over 80%; incremental extraction preserves working software throughout",
      "Don't extract services for technical reasons alone (language preference, framework modernity) — only extract when there is a concrete team autonomy or scaling problem to solve",
      "Don't ignore the distributed systems tax: extracted services introduce network latency, partial failure modes, and distributed transactions that add complexity the monolith did not have",
      "Don't underestimate the data decomposition problem — shared database tables with foreign keys, triggers, and implicit ownership are the hardest part of most decompositions"
    ],
    "donts_zh": [
      "不要从大爆炸式重写开始——完整重写的历史失败率超过80%；增量提取在整个过程中保留可工作的软件",
      "不要仅出于技术原因（语言偏好、框架现代性）提取服务——只有在有具体的团队自主权或扩展问题需要解决时才提取",
      "不要忽视分布式系统的代价：提取的服务引入了单体没有的网络延迟、部分故障模式和分布式事务，增加了复杂性",
      "不要低估数据分解问题——具有外键、触发器和隐式所有权的共享数据库表是大多数分解中最难的部分"
    ],
    "case_study_company": "Uber",
    "case_study": "Uber's 2016-2019 decomposition of their original Node.js monolith ('God') into domain microservices is one of the most documented large-scale monolith decompositions. They applied Strangler Fig at the API gateway level, routing specific endpoints to new Go-based services while the Node.js monolith continued handling the remainder. Their key learning: database decomposition was 3-5x more effort than code decomposition. Teams that extracted services without migrating data ownership created 'nano-monoliths' — services that still required coordinated deployment due to shared schema dependencies.",
    "case_study_zh": "Uber 2016-2019年将其原始Node.js单体（「God」）分解为领域微服务是记录最为详尽的大规模单体分解之一。他们在API网关层应用绞杀者无花果，将特定端点路由到新的Go服务，同时Node.js单体继续处理其余部分。他们的关键学习：数据库分解的工作量是代码分解的3-5倍。在没有迁移数据所有权的情况下提取服务的团队创建了「纳米单体」——由于共享模式依赖仍需协调部署的服务。",
    "when_not_to_use": [
      "Monoliths with fewer than 10 engineers where coordination overhead is manageable and the deployment bottleneck does not yet manifest",
      "Well-structured modular monoliths where internal module boundaries are clean — the right answer may be to preserve the monolith and strengthen its internal structure",
      "Systems under active business pressure where engineering bandwidth cannot absorb the productivity dip that accompanies any large-scale decomposition",
      "Teams lacking observability infrastructure — decomposing a monolith without distributed tracing and service-level metrics makes debugging regressions extremely difficult"
    ],
    "when_not_to_use_zh": [
      "工程师少于10人的单体，协调开销可控，部署瓶颈尚未显现",
      "内部模块边界清晰的结构良好的模块化单体——正确答案可能是保留单体并加强其内部结构",
      "处于活跃业务压力下的系统，工程带宽无法吸收任何大规模分解所带来的生产力下降",
      "缺乏可观测性基础设施的团队——在没有分布式追踪和服务级别指标的情况下分解单体使调试回归极其困难"
    ],
    "primary_source": "Newman, S. (2019). \"Monolith to Microservices\". O'Reilly Media. ISBN 978-1-492-04714-1",
    "primary_source_zh": "Newman, S.（2019）。《从单体到微服务》。O'Reilly Media。ISBN 978-1-492-04714-1",
    "secondary_sources": [
      "Fowler, M. (2004). \"StranglerFigApplication\". martinfowler.com/bliki/StranglerFigApplication.html",
      "Richardson, C. (2019). \"Microservices Patterns\". Manning Publications. Chapter 13: Refactoring to Microservices."
    ],
    "secondary_sources_zh": [
      "Fowler, M.（2004）。《绞杀者无花果应用》。martinfowler.com/bliki/StranglerFigApplication.html",
      "Richardson, C.（2019）。《微服务模式》。Manning出版社。第13章：重构为微服务。"
    ]
  },
  {
    "id": 87,
    "name": "ReAct Framework",
    "name_zh": "ReAct 推理-行动框架",
    "slug": "react-framework",
    "category": "ai",
    "desc": "Interleave reasoning traces and actions in LLM agents",
    "desc_zh": "在大模型代理中交织推理轨迹与动作执行",
    "steps": [
      "Define the agent's task scope and available tool set (APIs, search, code exec)",
      "Instrument the LLM prompt to emit Thought / Action / Observation traces",
      "Execute each Action against real tools and feed Observation back into context",
      "Repeat the Thought-Action-Observation loop until the stopping condition is met",
      "Evaluate trace quality and prune or summarize long reasoning chains for efficiency"
    ],
    "steps_zh": [
      "定义代理任务范围及可用工具集（API、搜索、代码执行等）",
      "在提示词中要求模型输出 Thought / Action / Observation 三段式轨迹",
      "将每个 Action 发送至真实工具并将 Observation 回写入上下文",
      "循环执行推理-行动-观察，直至达到终止条件",
      "评估轨迹质量，裁剪或摘要过长推理链以降低成本"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Reason",
      "Act",
      "Observe"
    ],
    "viz_labels_zh": [
      "推理",
      "行动",
      "观察"
    ],
    "related": [
      "tool-use-design-pattern",
      "prompt-chaining",
      "multi-agent-orchestration-pattern"
    ],
    "tags": [
      "react",
      "reasoning",
      "agents",
      "llm",
      "tool-use"
    ],
    "origin_author": "Shunyu Yao et al., 2022",
    "origin_source": "ReAct: Synergizing Reasoning and Acting in Language Models (ICLR 2023)",
    "origin_source_zh": "ReAct：在语言模型中协同推理与行动（ICLR 2023）",
    "complexity": "intermediate",
    "when_to_use": [
      "Building agents that need to call external tools (search, APIs, databases) to answer questions",
      "Tasks where pure chain-of-thought reasoning is insufficient without grounding in real data",
      "Interactive problem-solving scenarios requiring iterative refinement based on observations",
      "Knowledge-intensive QA where the model must decide what to look up and when to stop"
    ],
    "when_to_use_zh": [
      "构建需要调用外部工具（搜索、API、数据库）回答问题的代理",
      "纯思维链推理不够、需要真实数据锚定的任务",
      "需要基于观察迭代优化的交互式问题求解场景",
      "模型需自行决定检索内容和终止时机的知识密集型问答"
    ],
    "core_concepts": [
      "Thought: The model's internal reasoning step that plans the next action or interprets an observation",
      "Action: A concrete tool invocation emitted by the model (e.g., Search[query], Lookup[term])",
      "Observation: The external environment's response returned after executing an action",
      "Trace: The full interleaved sequence of Thought-Action-Observation steps forming the agent's trajectory",
      "Grounding: Anchoring the model's reasoning in real-world data obtained via tool calls"
    ],
    "core_concepts_zh": [
      "Thought（思考）：模型规划下一步动作或解读观察结果的内部推理步骤",
      "Action（行动）：模型输出的具体工具调用（如 Search[query]、Lookup[term]）",
      "Observation（观察）：执行动作后外部环境返回的响应结果",
      "Trace（轨迹）：Thought-Action-Observation 交替步骤组成的完整代理执行路径",
      "Grounding（锚定）：通过工具调用获取真实数据，为模型推理提供事实依据"
    ],
    "timeline": [
      [
        "2022-10",
        "Shunyu Yao et al. release the ReAct preprint on arXiv"
      ],
      [
        "2023-01",
        "ReAct accepted at ICLR 2023; gains rapid adoption in the agent community"
      ],
      [
        "2023-03",
        "LangChain integrates ReAct as a default agent executor pattern"
      ],
      [
        "2023-10",
        "ReAct-style loops become the backbone of OpenAI's function-calling agents"
      ],
      [
        "2024-06",
        "Variants like Reflexion and LATS extend ReAct with self-reflection and tree search"
      ]
    ],
    "timeline_zh": [
      [
        "2022-10",
        "Shunyu Yao 等人在 arXiv 发布 ReAct 预印本"
      ],
      [
        "2023-01",
        "ReAct 被 ICLR 2023 接收，在代理社区迅速获得关注"
      ],
      [
        "2023-03",
        "LangChain 将 ReAct 集成为默认代理执行模式"
      ],
      [
        "2023-10",
        "ReAct 风格循环成为 OpenAI 函数调用代理的核心骨架"
      ],
      [
        "2024-06",
        "Reflexion、LATS 等变体扩展 ReAct，加入自我反思与树搜索"
      ]
    ],
    "dos": [
      "Keep tool descriptions concise and unambiguous so the model selects the right tool reliably",
      "Set a maximum iteration limit to prevent runaway loops that burn tokens and time",
      "Include few-shot examples of complete Thought-Action-Observation traces in the system prompt",
      "Log full traces for debugging -- they are the primary artifact for understanding agent behavior"
    ],
    "dos_zh": [
      "保持工具描述简明无歧义，使模型能可靠选择正确工具",
      "设置最大迭代次数上限，防止失控循环消耗过多 Token 和时间",
      "在系统提示词中包含完整 Thought-Action-Observation 轨迹的少样本示例",
      "记录完整轨迹用于调试——轨迹是理解代理行为的核心产物"
    ],
    "donts": [
      "Don't give the model too many tools at once -- tool selection accuracy drops sharply past 15-20 tools",
      "Don't skip observation validation -- feeding raw unstructured HTML into context wastes tokens",
      "Don't let the agent self-terminate without a verification step -- it may stop prematurely on partial answers",
      "Don't use ReAct for simple single-step tasks -- the overhead of the loop is unnecessary"
    ],
    "donts_zh": [
      "不要一次给模型过多工具——超过 15-20 个后工具选择准确率急剧下降",
      "不要跳过观察结果验证——将原始 HTML 直接注入上下文浪费 Token",
      "不要让代理在没有验证步骤的情况下自行终止——可能在部分答案时过早停止",
      "不要对简单的单步任务使用 ReAct——循环的额外开销是不必要的"
    ],
    "case_study_company": "LangChain",
    "case_study": "LangChain adopted the ReAct pattern as its default AgentExecutor, enabling thousands of developers to build tool-augmented agents with minimal boilerplate. By standardizing the Thought-Action-Observation loop, LangChain reduced agent development time from weeks to hours and became the most widely used LLM orchestration framework by mid-2023.",
    "case_study_zh": "LangChain 将 ReAct 模式作为默认的 AgentExecutor 实现，使数千名开发者能以极少的样板代码构建工具增强型代理。通过标准化 Thought-Action-Observation 循环，LangChain 将代理开发时间从数周缩短到数小时，并在 2023 年中成为使用最广泛的大模型编排框架。",
    "when_not_to_use": [
      "Simple classification or generation tasks that don't require external tool calls",
      "Latency-critical applications where multi-step loops add unacceptable delay",
      "Tasks where the full context is already available in the prompt and retrieval adds no value",
      "Scenarios requiring deterministic output -- the non-deterministic loop may produce variable traces"
    ],
    "when_not_to_use_zh": [
      "不需要外部工具调用的简单分类或生成任务",
      "对延迟极度敏感、多步循环会带来不可接受延迟的应用",
      "提示词已包含全部所需上下文、检索不增加价值的任务",
      "需要确定性输出的场景——非确定性循环可能产生不同轨迹"
    ],
    "adopters": [
      "LangChain",
      "LlamaIndex",
      "OpenAI (Assistants API)",
      "Google DeepMind",
      "Microsoft Semantic Kernel"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Shunyu Yao et al. (2023). \"ReAct: Synergizing Reasoning and Acting in Language Models\". ICLR 2023.",
    "secondary_sources": [
      "Jason Wei et al. (2022). \"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models\". NeurIPS 2022.",
      "Takeshi Kojima et al. (2022). \"Large Language Models are Zero-Shot Reasoners\". NeurIPS 2022."
    ],
    "typed_relations": [
      {
        "slug": "tool-use-design-pattern",
        "type": "complement"
      },
      {
        "slug": "prompt-chaining",
        "type": "complement"
      },
      {
        "slug": "multi-agent-orchestration-pattern",
        "type": "extends"
      }
    ]
  },
  {
    "id": 88,
    "name": "RAG Architecture",
    "name_zh": "检索增强生成架构",
    "slug": "rag-architecture",
    "category": "ai",
    "desc": "Ground LLM responses with retrieved external knowledge",
    "desc_zh": "通过检索外部知识为大模型响应提供事实依据",
    "steps": [
      "Chunk, embed, and index domain documents into a vector store",
      "At query time, embed the user question and retrieve top-K relevant chunks",
      "Construct a prompt that injects retrieved context before the question",
      "Generate the answer with the LLM, citing source chunks for traceability",
      "Evaluate retrieval recall and generation faithfulness; iterate on chunking strategy"
    ],
    "steps_zh": [
      "对领域文档进行分块、向量化并写入向量数据库",
      "查询时将用户问题向量化，检索 Top-K 相关片段",
      "构建提示词，将检索内容注入问题前作为上下文",
      "由大模型生成答案并引用来源片段以确保可溯源",
      "评估检索召回率与生成忠实度，迭代优化分块策略"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Chunk",
      "Embed",
      "Retrieve",
      "Augment",
      "Generate"
    ],
    "viz_labels_zh": [
      "分块",
      "向量化",
      "检索",
      "增强",
      "生成"
    ],
    "related": [
      "llm-system-design-patterns",
      "context-window-management",
      "ai-observability-framework"
    ],
    "tags": [
      "rag",
      "retrieval",
      "vector-store",
      "grounding",
      "knowledge"
    ],
    "origin_author": "Patrick Lewis et al. (Meta AI / UCL), 2020",
    "origin_source": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks (NeurIPS 2020)",
    "origin_source_zh": "面向知识密集型 NLP 任务的检索增强生成（NeurIPS 2020）",
    "complexity": "intermediate",
    "when_to_use": [
      "Enterprise knowledge bases where the LLM must answer from proprietary or frequently updated documents",
      "Customer support bots that need to cite specific policy pages or product documentation",
      "Legal or compliance use cases requiring traceable source attribution for every claim",
      "Any domain where hallucination risk is high and factual grounding is non-negotiable"
    ],
    "when_to_use_zh": [
      "企业知识库场景，大模型需基于私有或频繁更新的文档回答问题",
      "客户支持机器人需要引用特定政策页面或产品文档",
      "法律或合规场景，每项声明都要求可溯源的出处标注",
      "任何幻觉风险高、事实锚定不可妥协的领域"
    ],
    "core_concepts": [
      "Chunking: Splitting source documents into semantically meaningful segments for embedding and retrieval",
      "Embedding: Converting text chunks into dense vector representations using models like OpenAI Ada or BGE",
      "Vector Store: A specialized database (Pinecone, Weaviate, Chroma) that indexes embeddings for similarity search",
      "Retrieval: Finding the top-K most relevant chunks to a query via approximate nearest neighbor search",
      "Faithfulness: The degree to which the generated answer is supported by and consistent with retrieved context"
    ],
    "core_concepts_zh": [
      "分块（Chunking）：将源文档拆分为语义完整的片段以进行向量化和检索",
      "向量化（Embedding）：使用 OpenAI Ada、BGE 等模型将文本片段转为稠密向量表示",
      "向量数据库（Vector Store）：索引向量并支持相似度搜索的专用数据库（Pinecone、Weaviate、Chroma）",
      "检索（Retrieval）：通过近似最近邻搜索找到与查询最相关的 Top-K 片段",
      "忠实度（Faithfulness）：生成答案被检索上下文支持和印证的程度"
    ],
    "timeline": [
      [
        "2020-05",
        "Patrick Lewis et al. publish the RAG paper, combining DPR retriever with BART generator"
      ],
      [
        "2022-12",
        "ChatGPT launch drives massive interest in grounding LLMs with external knowledge"
      ],
      [
        "2023-03",
        "LlamaIndex (formerly GPT Index) and LangChain popularize RAG as a standard pattern"
      ],
      [
        "2023-09",
        "Advanced RAG techniques emerge: HyDE, multi-hop retrieval, re-ranking pipelines"
      ],
      [
        "2024-03",
        "GraphRAG and agentic RAG architectures push beyond simple vector similarity retrieval"
      ]
    ],
    "timeline_zh": [
      [
        "2020-05",
        "Patrick Lewis 等人发表 RAG 论文，将 DPR 检索器与 BART 生成器结合"
      ],
      [
        "2022-12",
        "ChatGPT 发布推动大量关于用外部知识锚定大模型的研究兴趣"
      ],
      [
        "2023-03",
        "LlamaIndex（原 GPT Index）和 LangChain 将 RAG 推广为标准模式"
      ],
      [
        "2023-09",
        "高级 RAG 技术涌现：HyDE、多跳检索、重排序管线"
      ],
      [
        "2024-03",
        "GraphRAG 和代理式 RAG 架构突破简单向量相似度检索的局限"
      ]
    ],
    "dos": [
      "Experiment with chunk sizes (256-1024 tokens) and overlap -- optimal size varies by domain",
      "Use a re-ranker (Cohere Rerank, cross-encoder) to improve precision after initial retrieval",
      "Include metadata filters (date, source, category) to narrow retrieval scope before vector search",
      "Evaluate with RAGAS or similar frameworks that measure both retrieval quality and answer faithfulness"
    ],
    "dos_zh": [
      "实验不同分块大小（256-1024 token）和重叠度——最优大小因领域而异",
      "使用重排序器（Cohere Rerank、交叉编码器）在初始检索后提升精确率",
      "包含元数据过滤（日期、来源、类别）在向量搜索前缩小检索范围",
      "使用 RAGAS 等框架评估检索质量和回答忠实度"
    ],
    "donts": [
      "Don't assume bigger chunks are better -- oversized chunks dilute relevance and waste context window",
      "Don't skip hybrid search -- combining keyword (BM25) with vector search often outperforms either alone",
      "Don't forget to refresh embeddings when source documents are updated -- stale indexes cause drift",
      "Don't treat RAG as a substitute for fine-tuning when the task requires deep domain reasoning"
    ],
    "donts_zh": [
      "不要假设分块越大越好——过大的分块会稀释相关性并浪费上下文窗口",
      "不要跳过混合搜索——关键词（BM25）与向量搜索结合通常优于单独使用",
      "不要忘记在源文档更新时刷新向量——陈旧索引会导致漂移",
      "不要将 RAG 当作微调的替代品——需要深层领域推理的任务仍需微调"
    ],
    "case_study_company": "Klarna",
    "case_study": "Klarna deployed a RAG-powered customer service assistant that retrieves answers from their internal policy and product documentation. The system handles 2.3 million conversations in its first month, performing the equivalent work of 700 full-time agents with a 25% reduction in repeat inquiries, while maintaining source traceability for compliance.",
    "case_study_zh": "Klarna 部署了基于 RAG 的客服助手，从内部政策和产品文档中检索答案。该系统在上线首月处理了 230 万次对话，相当于 700 名全职客服的工作量，重复咨询减少 25%，同时保持了合规所需的来源可追溯性。",
    "when_not_to_use": [
      "Tasks requiring creative generation or open-ended brainstorming where grounding constrains the output",
      "Small-context scenarios where all necessary information fits directly in the system prompt",
      "Real-time streaming applications where retrieval latency is unacceptable",
      "When the knowledge base is too noisy or unstructured for meaningful similarity matching"
    ],
    "when_not_to_use_zh": [
      "需要创意生成或开放头脑风暴、锚定反而限制输出的任务",
      "所有必要信息已能直接放入系统提示词的小上下文场景",
      "检索延迟不可接受的实时流式应用",
      "知识库过于嘈杂或无结构、难以进行有效相似度匹配的场景"
    ],
    "adopters": [
      "Klarna",
      "Notion AI",
      "Perplexity AI",
      "Databricks",
      "Elastic"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Patrick Lewis et al. (2020). \"Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks\". NeurIPS 2020.",
    "secondary_sources": [
      "Vladimir Karpukhin et al. (2020). \"Dense Passage Retrieval for Open-Domain Question Answering\". EMNLP 2020.",
      "Jerry Liu (2022). \"LlamaIndex: A Data Framework for LLM Applications\". llamaindex.ai."
    ],
    "typed_relations": [
      {
        "slug": "llm-system-design-patterns",
        "type": "related"
      },
      {
        "slug": "context-window-management",
        "type": "complement"
      },
      {
        "slug": "ai-observability-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 89,
    "name": "Multi-Agent Orchestration Pattern",
    "name_zh": "多代理编排模式",
    "slug": "multi-agent-orchestration-pattern",
    "category": "ai",
    "desc": "Coordinate specialized AI agents via an orchestrator layer",
    "desc_zh": "通过编排层协调多个专职 AI 代理协同工作",
    "steps": [
      "Decompose the overall task into sub-tasks with clear input/output contracts",
      "Assign each sub-task to a specialized agent with the appropriate model and tools",
      "Implement an orchestrator agent that routes tasks and aggregates results",
      "Define inter-agent communication schemas (structured JSON messages or function calls)",
      "Add circuit-breakers and fallback paths to handle agent failures gracefully"
    ],
    "steps_zh": [
      "将整体任务分解为具有明确输入输出契约的子任务",
      "为每个子任务指定配备合适模型与工具的专职代理",
      "实现负责任务路由和结果聚合的编排代理",
      "定义代理间通信模式（结构化 JSON 消息或函数调用）",
      "添加熔断器与降级路径以优雅处理代理故障"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Orchestrator",
      "Planner",
      "Worker",
      "Tool"
    ],
    "viz_labels_zh": [
      "编排器",
      "规划器",
      "执行Agent",
      "工具"
    ],
    "related": [
      "react-framework",
      "agent-communication-protocol",
      "agent-oriented-design-thinking"
    ],
    "tags": [
      "multi-agent",
      "orchestration",
      "coordination",
      "task-decomposition"
    ],
    "origin_author": "Qian Chen et al. (Tsinghua / Microsoft Research), 2023",
    "origin_source": "Communicative Agents for Software Development (ChatDev) and AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "origin_source_zh": "面向软件开发的通信代理（ChatDev）及 AutoGen：通过多代理对话实现下一代大模型应用",
    "complexity": "advanced",
    "when_to_use": [
      "Complex tasks that require diverse expertise, e.g. a coding agent, a testing agent, and a review agent",
      "Workflows where different sub-tasks benefit from different models or tool configurations",
      "Research or analysis pipelines where debate or consensus among agents improves output quality",
      "Production systems that need to scale sub-tasks independently across parallel agents"
    ],
    "when_to_use_zh": [
      "需要多种专长的复杂任务——例如编码代理、测试代理和审查代理协作",
      "不同子任务受益于不同模型或工具配置的工作流",
      "代理间辩论或共识机制可提升输出质量的研究或分析管线",
      "需要跨并行代理独立扩展子任务的生产系统"
    ],
    "core_concepts": [
      "Orchestrator: A supervisory agent that decomposes tasks, routes them to specialists, and merges results",
      "Specialist Agent: A purpose-built agent with a narrow tool set and system prompt optimized for one sub-task",
      "Task Graph: The DAG of dependencies between sub-tasks that determines execution order and parallelism",
      "Shared Memory: A common state store (blackboard) agents read from and write to for coordination",
      "Debate Protocol: A pattern where multiple agents critique each other's outputs to improve accuracy"
    ],
    "core_concepts_zh": [
      "编排器（Orchestrator）：负责任务分解、路由到专职代理并合并结果的监督代理",
      "专职代理（Specialist Agent）：配备窄工具集和针对单一子任务优化的系统提示词的专用代理",
      "任务图（Task Graph）：决定执行顺序和并行度的子任务依赖有向无环图",
      "共享内存（Shared Memory）：代理间用于协调的公共状态存储（黑板机制）",
      "辩论协议（Debate Protocol）：多个代理相互批评输出以提高准确性的模式"
    ],
    "timeline": [
      [
        "2023-07",
        "ChatDev paper demonstrates multi-agent software development with role-playing agents"
      ],
      [
        "2023-09",
        "Microsoft releases AutoGen, the first major multi-agent conversation framework"
      ],
      [
        "2023-11",
        "CrewAI launches, popularizing role-based multi-agent orchestration for non-technical users"
      ],
      [
        "2024-01",
        "LangGraph introduces graph-based agent orchestration with cycles and persistence"
      ],
      [
        "2024-08",
        "OpenAI Swarm and Anthropic Claude multi-agent patterns push enterprise adoption"
      ]
    ],
    "timeline_zh": [
      [
        "2023-07",
        "ChatDev 论文展示基于角色扮演代理的多代理软件开发"
      ],
      [
        "2023-09",
        "微软发布 AutoGen，首个主流多代理对话框架"
      ],
      [
        "2023-11",
        "CrewAI 发布，向非技术用户普及基于角色的多代理编排"
      ],
      [
        "2024-01",
        "LangGraph 引入支持循环和持久化的基于图的代理编排"
      ],
      [
        "2024-08",
        "OpenAI Swarm 和 Anthropic Claude 多代理模式推动企业级采用"
      ]
    ],
    "dos": [
      "Start with the simplest topology (linear handoff) and add complexity only when single-agent fails",
      "Give each agent a clearly scoped system prompt -- blurry roles cause redundant or conflicting work",
      "Use structured message schemas between agents to avoid natural-language parsing errors",
      "Implement observability at the orchestrator level so you can trace the full multi-agent execution"
    ],
    "dos_zh": [
      "从最简拓扑（线性交接）开始，仅在单代理无法胜任时增加复杂性",
      "为每个代理设定清晰范围的系统提示词——模糊角色会导致重复或冲突工作",
      "代理间使用结构化消息模式，避免自然语言解析错误",
      "在编排器层面实现可观测性，以追踪完整的多代理执行过程"
    ],
    "donts": [
      "Don't default to multi-agent when a single agent with good tools can solve the problem -- added coordination cost is real",
      "Don't let agents communicate in unstructured free text -- it compounds errors across handoffs",
      "Don't create circular dependencies between agents without explicit loop-breaking conditions",
      "Don't ignore the token cost multiplier -- N agents each consuming context can be N times more expensive"
    ],
    "donts_zh": [
      "当单个配备良好工具的代理能解决问题时不要默认使用多代理——协调成本是真实的",
      "不要让代理以非结构化自由文本通信——错误会在交接中累积放大",
      "不要在代理间创建循环依赖而不设显式中断条件",
      "不要忽视 Token 成本倍增——N 个代理各自消耗上下文可能贵 N 倍"
    ],
    "case_study_company": "Microsoft",
    "case_study": "Microsoft built AutoGen to power internal multi-agent workflows for code generation and data analysis. In production pilots, a three-agent setup (coder, critic, executor) reduced code review iteration cycles by 40% compared to single-agent generation. AutoGen's conversation-driven architecture became the reference implementation for enterprise multi-agent systems.",
    "case_study_zh": "微软构建 AutoGen 驱动内部代码生成和数据分析的多代理工作流。在生产试点中，三代理配置（编码者、评审者、执行者）比单代理生成减少了 40% 的代码审查迭代周期。AutoGen 基于对话的架构成为企业多代理系统的参考实现。",
    "when_not_to_use": [
      "Simple tasks where a single well-prompted agent with tools is sufficient",
      "Latency-critical paths where multi-agent round-trips add unacceptable delay",
      "Early-stage prototypes where the added complexity of agent coordination hinders iteration speed",
      "Cost-constrained scenarios where multiplying LLM calls across agents exceeds budget"
    ],
    "when_not_to_use_zh": [
      "单个配备工具的良好提示代理即可胜任的简单任务",
      "多代理往返增加不可接受延迟的延迟关键路径",
      "代理协调的额外复杂性阻碍迭代速度的早期原型阶段",
      "跨代理乘数式 LLM 调用超出预算的成本受限场景"
    ],
    "adopters": [
      "Microsoft (AutoGen)",
      "CrewAI",
      "LangGraph (LangChain)",
      "ChatDev (Tsinghua)",
      "Camel-AI"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Qian Chen et al. (2023). \"Communicative Agents for Software Development (ChatDev)\". arXiv:2307.07924.",
    "secondary_sources": [
      "Chi Wang et al. (2023). \"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation\". arXiv:2308.08155.",
      "Yiran Wu et al. (2023). \"AutoGen: A Framework for Large Language Model Applications\". Microsoft Research."
    ],
    "typed_relations": [
      {
        "slug": "react-framework",
        "type": "complement"
      },
      {
        "slug": "agent-communication-protocol",
        "type": "complement"
      },
      {
        "slug": "agent-oriented-design-thinking",
        "type": "prerequisite"
      }
    ]
  },
  {
    "id": 90,
    "name": "Human-in-the-Loop Design",
    "name_zh": "人机协同回路设计",
    "slug": "human-in-the-loop",
    "category": "ai",
    "desc": "Insert human checkpoints into automated AI workflows",
    "desc_zh": "在自动化 AI 工作流中插入人工审核节点",
    "steps": [
      "Map the workflow and identify high-risk or high-uncertainty decision points",
      "Define escalation criteria that trigger human review (confidence threshold, risk score)",
      "Design low-friction review UIs that surface AI reasoning alongside the decision",
      "Capture human feedback as labeled training signal for continuous model improvement",
      "Instrument approval latency and human override rates to tune automation thresholds"
    ],
    "steps_zh": [
      "梳理工作流，识别高风险或高不确定性的决策节点",
      "定义触发人工审核的升级条件（置信度阈值、风险评分）",
      "设计低摩擦的审核界面，将 AI 推理过程与决策并排呈现",
      "将人工反馈作为标注训练信号用于持续模型优化",
      "监控审批延迟与人工覆盖率，动态调整自动化阈值"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "AI Output",
      "Human Review",
      "Approve",
      "Correct"
    ],
    "viz_labels_zh": [
      "AI输出",
      "人工审核",
      "批准",
      "纠错"
    ],
    "related": [
      "human-ai-interaction-design",
      "guardrails-framework",
      "responsible-ai-design"
    ],
    "tags": [
      "human-in-the-loop",
      "oversight",
      "escalation",
      "feedback"
    ],
    "origin_author": "DARPA / Weng et al., 2003 (concept); modern AI-HITL formalized by Monarch (Google), 2020",
    "origin_source": "Human-in-the-Loop Machine Learning: Active Learning and Annotation (Robert Munro, O'Reilly, 2021)",
    "origin_source_zh": "人在回路机器学习：主动学习与标注（Robert Munro，O'Reilly，2021）",
    "complexity": "intermediate",
    "when_to_use": [
      "High-stakes decisions where AI errors carry legal, financial, or safety consequences",
      "Domains with evolving policies where automated rules alone cannot cover all edge cases",
      "Model training pipelines that benefit from expert-labeled corrections as feedback signals",
      "Regulated industries (healthcare, finance, legal) requiring auditable human oversight"
    ],
    "when_to_use_zh": [
      "AI 错误会带来法律、财务或安全后果的高风险决策",
      "政策不断演变、自动化规则无法覆盖所有边缘情况的领域",
      "受益于专家标注纠正作为反馈信号的模型训练管线",
      "需要可审计人工监督的受监管行业（医疗、金融、法律）"
    ],
    "core_concepts": [
      "Confidence Threshold: A model-reported score below which outputs are routed to human review",
      "Escalation Path: The defined route from automated decision to human reviewer, including SLAs",
      "Active Learning: Strategically selecting the most uncertain samples for human labeling to maximize model improvement",
      "Override Rate: The percentage of AI decisions reversed by humans -- a key calibration metric",
      "Feedback Loop: The mechanism that converts human corrections into training data for model retraining"
    ],
    "core_concepts_zh": [
      "置信度阈值（Confidence Threshold）：模型报告的分数低于该值时将输出路由至人工审核",
      "升级路径（Escalation Path）：从自动化决策到人工审核者的既定路由，包括 SLA 约定",
      "主动学习（Active Learning）：策略性选择最不确定的样本交由人工标注以最大化模型改进",
      "覆盖率（Override Rate）：被人工推翻的 AI 决策百分比——关键的校准指标",
      "反馈回路（Feedback Loop）：将人工纠正转化为模型再训练数据的机制"
    ],
    "timeline": [
      [
        "2003",
        "DARPA programs formalize human-in-the-loop concepts for autonomous military systems"
      ],
      [
        "2016",
        "Tesla Autopilot incidents highlight the critical need for human oversight in AI-driven systems"
      ],
      [
        "2020",
        "Google's Monarch system implements HITL for large-scale ML model monitoring"
      ],
      [
        "2021",
        "Robert Munro publishes the definitive O'Reilly book on HITL machine learning"
      ],
      [
        "2024",
        "EU AI Act mandates human oversight for high-risk AI systems, making HITL a legal requirement"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "DARPA 项目为自主军事系统正式确立人在回路概念"
      ],
      [
        "2016",
        "特斯拉自动驾驶事故凸显 AI 驱动系统中人工监督的关键需求"
      ],
      [
        "2020",
        "Google 的 Monarch 系统为大规模 ML 模型监控实现人在回路"
      ],
      [
        "2021",
        "Robert Munro 出版关于人在回路机器学习的权威 O'Reilly 图书"
      ],
      [
        "2024",
        "欧盟 AI 法案强制要求高风险 AI 系统具备人工监督，使 HITL 成为法律要求"
      ]
    ],
    "dos": [
      "Design review UIs that show the AI's reasoning and confidence, not just the final decision",
      "Track override rates over time -- a rising rate signals model degradation or policy drift",
      "Set SLAs for human review turnaround to prevent HITL from becoming a bottleneck",
      "Use human corrections as high-quality labels to continuously retrain and improve the model"
    ],
    "dos_zh": [
      "设计审核界面时展示 AI 的推理过程和置信度，而非仅展示最终决策",
      "跟踪一段时间内的人工覆盖率——上升趋势意味着模型退化或政策漂移",
      "为人工审核设定 SLA 响应时限，防止 HITL 成为瓶颈",
      "将人工纠正作为高质量标注数据持续再训练和改进模型"
    ],
    "donts": [
      "Don't create review fatigue by sending too many low-risk items to humans -- it degrades review quality",
      "Don't skip the feedback loop -- human reviews that never improve the model are wasted effort",
      "Don't assume human reviewers are always correct -- build in inter-annotator agreement checks",
      "Don't make HITL an afterthought -- design it into the system architecture from the start"
    ],
    "donts_zh": [
      "不要将过多低风险项目发送至人工审核——审核疲劳会降低审核质量",
      "不要跳过反馈回路——从不改进模型的人工审核是浪费精力",
      "不要假设人工审核者永远正确——内置标注者一致性检查",
      "不要把 HITL 当作事后补救——从系统架构设计之初就纳入"
    ],
    "case_study_company": "Scale AI",
    "case_study": "Scale AI built its core business around human-in-the-loop data labeling for AI training. Their platform routes ML model predictions through expert human reviewers who correct labels, creating a virtuous cycle where each correction improves the next model iteration. This approach powers autonomous vehicle training data for Waymo, Toyota, and GM Cruise with 99.5%+ label accuracy.",
    "case_study_zh": "Scale AI 围绕人在回路数据标注构建核心业务。其平台将 ML 模型预测路由至专家人工审核者进行标签纠正，形成良性循环——每次纠正都改进下一次模型迭代。该方法为 Waymo、丰田和 GM Cruise 的自动驾驶训练数据提供 99.5% 以上的标注准确率。",
    "when_not_to_use": [
      "Fully autonomous real-time systems where human review latency is physically impossible (e.g., high-frequency trading)",
      "Low-risk, high-volume tasks where the cost of human review exceeds the cost of occasional errors",
      "Prototyping phases where rapid iteration matters more than production-grade oversight",
      "Tasks where model accuracy already exceeds human performance on the same task"
    ],
    "when_not_to_use_zh": [
      "人工审核延迟在物理上不可行的全自主实时系统（如高频交易）",
      "人工审核成本超过偶发错误成本的低风险高吞吐量任务",
      "快速迭代比生产级监督更重要的原型阶段",
      "模型在同一任务上准确率已超过人类表现的场景"
    ],
    "adopters": [
      "Scale AI",
      "Google (Monarch)",
      "Waymo",
      "Tesla",
      "Amazon Mechanical Turk / SageMaker Ground Truth"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "security"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Robert Munro (2021). \"Human-in-the-Loop Machine Learning: Active Learning and Annotation\". O'Reilly Media.",
    "secondary_sources": [
      "Burr Settles (2012). \"Active Learning\". Synthesis Lectures on Artificial Intelligence and Machine Learning, Morgan & Claypool.",
      "Saleema Amershi et al. (2014). \"Power to the People: The Role of Humans in Interactive Machine Learning\". AI Magazine, 35(4)."
    ],
    "typed_relations": [
      {
        "slug": "human-ai-interaction-design",
        "type": "complement"
      },
      {
        "slug": "guardrails-framework",
        "type": "complement"
      },
      {
        "slug": "responsible-ai-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 91,
    "name": "Prompt Chaining Pattern",
    "name_zh": "提示词链模式",
    "slug": "prompt-chaining",
    "category": "ai",
    "desc": "Decompose complex tasks into sequential prompt stages",
    "desc_zh": "将复杂任务分解为顺序执行的提示词阶段",
    "steps": [
      "Break the end-to-end task into discrete, independently testable prompt stages",
      "Define the output schema of each stage as the typed input of the next",
      "Validate and sanitize intermediate outputs before passing them downstream",
      "Add a gating step that checks quality criteria before continuing the chain",
      "Log each stage's input/output pair for debugging and prompt regression testing"
    ],
    "steps_zh": [
      "将端到端任务拆分为独立可测试的离散提示词阶段",
      "将每个阶段的输出模式定义为下一阶段的类型化输入",
      "在传递给下游前对中间输出进行验证和清洗",
      "添加质量门控步骤，满足标准后再继续执行链路",
      "记录每个阶段的输入输出对，用于调试与提示词回归测试"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Prompt 1",
      "Output 1",
      "Prompt 2",
      "Final Output"
    ],
    "viz_labels_zh": [
      "提示1",
      "输出1",
      "提示2",
      "最终输出"
    ],
    "related": [
      "react-framework",
      "prompt-engineering-patterns",
      "guardrails-framework"
    ],
    "tags": [
      "prompt-chaining",
      "pipeline",
      "decomposition",
      "sequential"
    ],
    "origin_author": "Harrison Chase (LangChain) / Wu Tongshuang et al., 2022",
    "origin_source": "AI Chains: Transparent and Controllable Human-AI Interaction by Chaining Large Language Model Prompts (CHI 2022)",
    "origin_source_zh": "AI Chains：通过链接大语言模型提示词实现透明可控的人机交互（CHI 2022）",
    "complexity": "beginner",
    "when_to_use": [
      "Complex generation tasks that benefit from separation into planning, drafting, and refinement stages",
      "Structured extraction pipelines where each stage narrows or enriches the data",
      "Content workflows (translate, summarize, then format) where each step has clear quality gates",
      "Any task where a single prompt fails to produce reliable output due to complexity"
    ],
    "when_to_use_zh": [
      "受益于拆分为规划、起草和优化阶段的复杂生成任务",
      "每个阶段逐步缩窄或丰富数据的结构化抽取管线",
      "每步有明确质量门控的内容工作流（翻译、摘要、排版）",
      "单一提示词因复杂度过高无法产出可靠输出的任何任务"
    ],
    "core_concepts": [
      "Stage: A single prompt call with a defined input schema and output schema",
      "Gate: A validation check between stages that decides whether to proceed, retry, or abort",
      "Schema Contract: The typed interface between stages ensuring one stage's output matches the next stage's expected input",
      "Chain Debugging: The practice of logging every stage's I/O to isolate where quality degrades",
      "Prompt Regression Testing: Running a fixed set of inputs through the chain to detect quality changes after prompt edits"
    ],
    "core_concepts_zh": [
      "阶段（Stage）：具有定义输入输出模式的单次提示词调用",
      "门控（Gate）：阶段间的验证检查，决定继续、重试还是终止",
      "模式契约（Schema Contract）：确保前一阶段输出匹配后一阶段期望输入的类型化接口",
      "链路调试（Chain Debugging）：记录每个阶段的输入输出以定位质量退化位置",
      "提示词回归测试：用固定输入集通过链路运行，在提示词修改后检测质量变化"
    ],
    "timeline": [
      [
        "2022-04",
        "Wu Tongshuang et al. publish AI Chains at CHI 2022, formalizing prompt chaining for interactive systems"
      ],
      [
        "2022-10",
        "LangChain launches with SequentialChain as a core abstraction for prompt pipelines"
      ],
      [
        "2023-06",
        "OpenAI Cookbook promotes prompt chaining as a best practice for complex tasks"
      ],
      [
        "2024-01",
        "DSPy introduces optimizable prompt pipelines with automatic prompt tuning across chain stages"
      ],
      [
        "2024-09",
        "Anthropic and OpenAI both recommend multi-step prompt architectures in their official guides"
      ]
    ],
    "timeline_zh": [
      [
        "2022-04",
        "Wu Tongshuang 等人在 CHI 2022 发表 AI Chains，正式定义交互系统的提示词链"
      ],
      [
        "2022-10",
        "LangChain 发布，以 SequentialChain 作为提示词管线的核心抽象"
      ],
      [
        "2023-06",
        "OpenAI Cookbook 将提示词链推广为复杂任务的最佳实践"
      ],
      [
        "2024-01",
        "DSPy 引入可优化的提示词管线，支持跨链路阶段的自动提示词调优"
      ],
      [
        "2024-09",
        "Anthropic 和 OpenAI 均在官方指南中推荐多步提示词架构"
      ]
    ],
    "dos": [
      "Keep each stage focused on a single transformation -- one stage, one job",
      "Define typed output schemas (JSON Schema or Pydantic) for every stage to catch format errors early",
      "Add retry logic with backoff at each stage rather than restarting the entire chain on failure",
      "Test each stage independently before assembling the full chain"
    ],
    "dos_zh": [
      "保持每个阶段专注于单一转换——一个阶段一个职责",
      "为每个阶段定义类型化输出模式（JSON Schema 或 Pydantic）以尽早捕获格式错误",
      "在每个阶段添加带退避的重试逻辑，而非失败时重启整条链路",
      "在组装完整链路前先独立测试每个阶段"
    ],
    "donts": [
      "Don't chain more than 5-7 stages -- longer chains accumulate errors and latency exponentially",
      "Don't pass raw unvalidated output between stages -- one bad stage poisons the entire downstream",
      "Don't ignore intermediate results -- they're your best debugging tool when the final output is wrong",
      "Don't over-engineer chains for tasks a single well-crafted prompt can handle"
    ],
    "donts_zh": [
      "不要链接超过 5-7 个阶段——更长的链路会指数级累积错误和延迟",
      "不要在阶段间传递未验证的原始输出——一个坏阶段会污染整个下游",
      "不要忽略中间结果——当最终输出有误时，它们是最好的调试工具",
      "不要为单个精心设计的提示词就能处理的任务过度工程化链路"
    ],
    "case_study_company": "Anthropic",
    "case_study": "Anthropic's internal content moderation pipeline uses a three-stage prompt chain: Stage 1 classifies content risk, Stage 2 extracts specific policy violations with citations, and Stage 3 generates a human-readable review summary. This chained approach improved classification accuracy by 15% over single-prompt methods and made the moderation reasoning auditable end-to-end.",
    "case_study_zh": "Anthropic 内部内容审核管线使用三阶段提示词链：阶段 1 分类内容风险，阶段 2 提取具体政策违规并引用依据，阶段 3 生成人类可读的审核摘要。链式方法比单提示词方法提升了 15% 的分类准确率，并使审核推理过程实现了端到端可审计。",
    "when_not_to_use": [
      "Simple single-turn tasks (classification, short extraction) where one prompt suffices",
      "Ultra-low-latency requirements where sequential LLM calls are too slow",
      "Exploratory or creative tasks where rigid stage boundaries constrain the model's output",
      "When the token cost of multiple calls significantly exceeds budget for the use case"
    ],
    "when_not_to_use_zh": [
      "单次提示词即可胜任的简单单轮任务（分类、短文抽取）",
      "顺序 LLM 调用过慢的超低延迟需求",
      "刚性阶段边界限制模型输出的探索性或创意任务",
      "多次调用的 Token 成本明显超出用例预算时"
    ],
    "adopters": [
      "Anthropic",
      "LangChain",
      "DSPy (Stanford)",
      "OpenAI",
      "Dust.tt"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Tongshuang Wu et al. (2022). \"AI Chains: Transparent and Controllable Human-AI Interaction by Chaining Large Language Model Prompts\". ACM CHI 2022.",
    "secondary_sources": [
      "Harrison Chase (2022). \"LangChain: Building Applications with LLMs through Composability\". github.com/langchain-ai.",
      "Ofir Press et al. (2022). \"Measuring and Narrowing the Compositionality Gap in Language Models\". arXiv:2210.03350."
    ],
    "typed_relations": [
      {
        "slug": "react-framework",
        "type": "complement"
      },
      {
        "slug": "prompt-engineering-patterns",
        "type": "complement"
      },
      {
        "slug": "guardrails-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 92,
    "name": "AI Pair Programming Model",
    "name_zh": "AI 结对编程模型",
    "slug": "ai-pair-programming",
    "category": "ai",
    "desc": "Structure developer and AI collaboration in the coding loop",
    "desc_zh": "在编码循环中结构化人类开发者与 AI 的协作方式",
    "steps": [
      "Establish role division: human owns intent and review; AI owns synthesis and drafting",
      "Use intent-first prompting - describe what, why, and constraints before asking for code",
      "Review AI output for correctness, security, and style before accepting each chunk",
      "Feed failing tests or error messages back to the AI as grounding for refinement",
      "Reflect at session end: categorize AI contributions and identify skill gaps to address"
    ],
    "steps_zh": [
      "明确角色分工：人类负责意图与审查，AI 负责合成与起草",
      "使用意图优先提示——在请求代码前先描述目标、原因和约束",
      "在接受每段代码前审查 AI 输出的正确性、安全性和风格",
      "将失败测试或错误信息反馈给 AI 作为细化的依据",
      "在会话结束时复盘：归类 AI 贡献并识别需补足的技能差距"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Write",
      "Suggest",
      "Accept",
      "Iterate"
    ],
    "viz_labels_zh": [
      "编写代码",
      "AI建议",
      "采纳",
      "迭代"
    ],
    "related": [
      "human-ai-interaction-design",
      "ai-assisted-refactoring",
      "conventional-comments"
    ],
    "tags": [
      "pair-programming",
      "ai-collaboration",
      "coding",
      "developer-tools"
    ],
    "origin_author": "GitHub (Oege de Moor et al.), 2021",
    "origin_source": "GitHub Copilot Technical Preview and Evaluating Large Language Models Trained on Code (Codex, OpenAI, 2021)",
    "origin_source_zh": "GitHub Copilot 技术预览及评估在代码上训练的大语言模型（Codex，OpenAI，2021）",
    "complexity": "beginner",
    "when_to_use": [
      "Day-to-day coding tasks including boilerplate generation, test writing, and refactoring",
      "Onboarding to unfamiliar codebases where the AI can explain patterns and suggest idioms",
      "Prototyping and rapid iteration where speed of code generation matters more than perfection",
      "Learning new languages or frameworks by generating examples and explaining alternatives"
    ],
    "when_to_use_zh": [
      "日常编码任务，包括样板代码生成、测试编写和重构",
      "AI 能解释模式和建议惯用写法的不熟悉代码库入门阶段",
      "代码生成速度比完美更重要的原型构建和快速迭代",
      "通过生成示例和解释替代方案来学习新语言或框架"
    ],
    "core_concepts": [
      "Intent-First Prompting: Describing the goal and constraints before requesting code to guide generation quality",
      "Ghost Text: Inline AI suggestions that appear as the developer types, requiring accept/reject decisions",
      "Chat-Driven Development: Using a conversational AI sidebar to discuss, plan, and generate code interactively",
      "Agentic Coding: AI autonomously executing multi-step coding tasks (edit, test, fix) with human approval gates",
      "Skill Calibration: Regularly assessing which tasks the AI handles well vs. where human expertise is still critical"
    ],
    "core_concepts_zh": [
      "意图优先提示（Intent-First Prompting）：在请求代码前描述目标和约束以引导生成质量",
      "幽灵文本（Ghost Text）：开发者输入时内联出现的 AI 建议，需要接受或拒绝决策",
      "对话驱动开发（Chat-Driven Development）：使用对话式 AI 侧边栏交互式讨论、规划和生成代码",
      "代理式编码（Agentic Coding）：AI 自主执行多步编码任务（编辑、测试、修复），设人工审批门控",
      "技能校准（Skill Calibration）：定期评估 AI 擅长哪些任务、哪些仍需人类专长"
    ],
    "timeline": [
      [
        "2021-06",
        "GitHub launches Copilot technical preview, powered by OpenAI Codex"
      ],
      [
        "2022-06",
        "GitHub Copilot becomes generally available; Amazon announces CodeWhisperer"
      ],
      [
        "2023-03",
        "GPT-4 powers Copilot Chat, enabling conversational pair programming in the IDE"
      ],
      [
        "2024-03",
        "Devin (Cognition) and SWE-Agent demonstrate autonomous coding agents"
      ],
      [
        "2025-02",
        "Claude Code (Anthropic), Cursor, and Windsurf push agentic coding into mainstream developer workflows"
      ]
    ],
    "timeline_zh": [
      [
        "2021-06",
        "GitHub 发布由 OpenAI Codex 驱动的 Copilot 技术预览"
      ],
      [
        "2022-06",
        "GitHub Copilot 正式发布；亚马逊宣布 CodeWhisperer"
      ],
      [
        "2023-03",
        "GPT-4 驱动 Copilot Chat，在 IDE 中实现对话式结对编程"
      ],
      [
        "2024-03",
        "Devin（Cognition）和 SWE-Agent 展示自主编码代理"
      ],
      [
        "2025-02",
        "Claude Code（Anthropic）、Cursor 和 Windsurf 将代理式编码推入主流开发者工作流"
      ]
    ],
    "dos": [
      "Always review AI-generated code for security vulnerabilities -- AI can confidently produce insecure patterns",
      "Provide full context (file imports, types, test expectations) to get higher-quality suggestions",
      "Use AI for tedious tasks (boilerplate, tests, docs) and reserve your cognitive energy for architecture decisions",
      "Treat AI output as a first draft -- iterate and refine rather than accepting blindly"
    ],
    "dos_zh": [
      "始终审查 AI 生成代码的安全漏洞——AI 可能自信地产出不安全的模式",
      "提供完整上下文（文件导入、类型、测试期望）以获得更高质量的建议",
      "将 AI 用于繁琐任务（样板代码、测试、文档），将认知精力留给架构决策",
      "将 AI 输出视为初稿——迭代优化而非盲目接受"
    ],
    "donts": [
      "Don't accept AI code without understanding it -- you own the code quality and technical debt",
      "Don't rely on AI for security-critical code paths without independent review and testing",
      "Don't stop learning fundamentals because the AI can generate code -- understanding trumps generation",
      "Don't paste secrets or credentials into AI prompts -- they may be logged or used in training"
    ],
    "donts_zh": [
      "不要在不理解的情况下接受 AI 代码——代码质量和技术债务由你负责",
      "不要在没有独立审查和测试的情况下依赖 AI 处理安全关键代码路径",
      "不要因为 AI 能生成代码就停止学习基础知识——理解比生成更重要",
      "不要将密钥或凭证粘贴到 AI 提示词中——它们可能被记录或用于训练"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub reported that Copilot users accepted approximately 30% of AI code suggestions and completed tasks 55% faster in controlled studies. By 2024, over 1.8 million paying developers and 77,000 organizations used Copilot, making it the fastest-adopted developer tool in history. The key insight was that autocomplete-style ghost text minimized workflow disruption.",
    "case_study_zh": "GitHub 报告 Copilot 用户接受了约 30% 的 AI 代码建议，在对照实验中完成任务速度提升 55%。截至 2024 年，超过 180 万付费开发者和 77,000 个组织使用 Copilot，使其成为历史上采用速度最快的开发者工具。关键洞察是自动补全式幽灵文本最小化了工作流干扰。",
    "when_not_to_use": [
      "Highly regulated codebases with strict IP provenance requirements that prohibit AI-generated code",
      "Security-sensitive cryptographic implementations where subtle AI errors could be catastrophic",
      "When the developer lacks sufficient understanding to review AI output critically",
      "Codebases with proprietary patterns so unique that generic AI models consistently produce incorrect suggestions"
    ],
    "when_not_to_use_zh": [
      "有严格知识产权溯源要求、禁止使用 AI 生成代码的高度受监管代码库",
      "AI 细微错误可能造成灾难性后果的安全敏感密码学实现",
      "开发者缺乏足够理解力来批判性审查 AI 输出时",
      "拥有独特专有模式、通用 AI 模型持续产出错误建议的代码库"
    ],
    "adopters": [
      "GitHub (Copilot)",
      "Anthropic (Claude Code)",
      "Cursor",
      "Google (Gemini Code Assist)",
      "Amazon (CodeWhisperer / Q Developer)"
    ],
    "abstraction_level": "code",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Mark Chen et al. (2021). \"Evaluating Large Language Models Trained on Code (Codex)\". arXiv:2107.03374.",
    "secondary_sources": [
      "Albert Ziegler et al. (2022). \"Productivity Assessment of Neural Code Completion\". MAPS 2022.",
      "Sida Peng et al. (2023). \"The Impact of AI on Developer Productivity: Evidence from GitHub Copilot\". arXiv:2302.06590."
    ],
    "typed_relations": [
      {
        "slug": "human-ai-interaction-design",
        "type": "complement"
      },
      {
        "slug": "ai-assisted-refactoring",
        "type": "complement"
      },
      {
        "slug": "conventional-comments",
        "type": "complement"
      }
    ]
  },
  {
    "id": 93,
    "name": "Guardrails Framework",
    "name_zh": "AI 护栏框架",
    "slug": "guardrails-framework",
    "category": "ai",
    "desc": "Enforce input/output constraints on LLM-powered systems",
    "desc_zh": "对大模型驱动系统强制执行输入输出约束",
    "steps": [
      "Enumerate risk categories relevant to the application (PII, toxicity, hallucination, injection)",
      "Implement input validators that reject or rewrite unsafe or malformed prompts",
      "Add output validators that check factual grounding, format compliance, and policy rules",
      "Route policy violations to fallback responses or human escalation queues",
      "Collect violation logs and retrain or tune guardrail classifiers on production data"
    ],
    "steps_zh": [
      "列举应用相关的风险类别（个人信息、有害内容、幻觉、注入攻击）",
      "实现输入验证器，拒绝或改写不安全或格式错误的提示词",
      "添加输出验证器，检查事实依据、格式合规性和策略规则",
      "将策略违规路由至降级响应或人工升级队列",
      "收集违规日志并在生产数据上重新训练或调整护栏分类器"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Input Guard",
      "LLM",
      "Output Guard",
      "Response"
    ],
    "viz_labels_zh": [
      "输入护栏",
      "大模型",
      "输出护栏",
      "响应"
    ],
    "related": [
      "responsible-ai-design",
      "ai-output-verification",
      "human-in-the-loop"
    ],
    "tags": [
      "guardrails",
      "safety",
      "validation",
      "constraints",
      "policy"
    ],
    "origin_author": "Shreya Rajpal (Guardrails AI), 2023; NVIDIA NeMo Guardrails team, 2023",
    "origin_source": "Guardrails AI open-source framework and NeMo Guardrails: A Toolkit for Controllable and Safe LLM Applications",
    "origin_source_zh": "Guardrails AI 开源框架及 NeMo Guardrails：可控安全大模型应用工具包",
    "complexity": "intermediate",
    "when_to_use": [
      "Customer-facing LLM applications where brand safety and content policy compliance are critical",
      "Systems handling sensitive data (PII, PHI, financial data) that must enforce data loss prevention",
      "Applications vulnerable to prompt injection attacks that need input sanitization layers",
      "Regulated deployments requiring auditable evidence that AI outputs comply with policy rules"
    ],
    "when_to_use_zh": [
      "品牌安全和内容政策合规至关重要的面客大模型应用",
      "处理敏感数据（PII、PHI、金融数据）、需要强制数据防泄露的系统",
      "易受提示词注入攻击、需要输入清洗层的应用",
      "需要 AI 输出符合政策规则的可审计证据的受监管部署"
    ],
    "core_concepts": [
      "Input Validator: A pre-processing layer that screens prompts for injection patterns, PII, or policy violations before they reach the model",
      "Output Validator: A post-processing layer that checks model responses for hallucination, toxicity, format errors, or off-topic content",
      "Fallback Response: A safe, pre-defined response returned when guardrails detect a violation",
      "Topical Rail: A constraint that keeps the model within a defined topic boundary and rejects off-topic queries",
      "Canary Token: A synthetic marker injected into context to detect if the model leaks system prompt content"
    ],
    "core_concepts_zh": [
      "输入验证器（Input Validator）：在提示词到达模型前筛查注入模式、PII 或政策违规的预处理层",
      "输出验证器（Output Validator）：检查模型响应是否存在幻觉、有害内容、格式错误或跑题的后处理层",
      "降级响应（Fallback Response）：护栏检测到违规时返回的安全预定义响应",
      "话题围栏（Topical Rail）：将模型限制在定义的主题边界内、拒绝跑题查询的约束",
      "金丝雀令牌（Canary Token）：注入上下文的合成标记，用于检测模型是否泄露系统提示词内容"
    ],
    "timeline": [
      [
        "2023-01",
        "Guardrails AI open-source library launches, introducing validator-based output checking"
      ],
      [
        "2023-04",
        "NVIDIA releases NeMo Guardrails with Colang, a domain-specific language for defining LLM rails"
      ],
      [
        "2023-07",
        "OWASP publishes the Top 10 for LLM Applications, galvanizing guardrails adoption"
      ],
      [
        "2023-11",
        "Anthropic introduces constitutional AI safety layers, a form of built-in guardrails"
      ],
      [
        "2024-06",
        "Guardrails Hub launches as a marketplace for community-contributed validators"
      ]
    ],
    "timeline_zh": [
      [
        "2023-01",
        "Guardrails AI 开源库发布，引入基于验证器的输出检查"
      ],
      [
        "2023-04",
        "NVIDIA 发布 NeMo Guardrails 及其配套的 Colang——一种用于定义大模型围栏的领域特定语言"
      ],
      [
        "2023-07",
        "OWASP 发布大模型应用十大风险，推动护栏框架广泛采用"
      ],
      [
        "2023-11",
        "Anthropic 引入 Constitutional AI 安全层——一种内置护栏形式"
      ],
      [
        "2024-06",
        "Guardrails Hub 作为社区贡献验证器的市场平台上线"
      ]
    ],
    "dos": [
      "Layer multiple guardrails (input + output + topical) -- no single check catches everything",
      "Log all guardrail triggers with full context for auditing and classifier improvement",
      "Test guardrails adversarially using red-team prompt injection datasets",
      "Keep guardrail latency under 200ms to avoid degrading user experience"
    ],
    "dos_zh": [
      "多层叠加护栏（输入 + 输出 + 话题）——没有单一检查能捕获所有问题",
      "记录所有护栏触发事件及完整上下文，用于审计和分类器改进",
      "使用红队提示词注入数据集对护栏进行对抗性测试",
      "将护栏延迟控制在 200ms 以内，避免降低用户体验"
    ],
    "donts": [
      "Don't rely solely on the LLM's own judgment for safety -- it can be jailbroken or misled",
      "Don't hardcode guardrail rules -- policies change and guardrails need to be updatable without redeployment",
      "Don't block legitimate queries with overly aggressive rules -- high false-positive rates erode user trust",
      "Don't skip input validation assuming the UI already sanitizes -- APIs can be called directly"
    ],
    "donts_zh": [
      "不要仅依赖大模型自身判断来保障安全——它可能被越狱或误导",
      "不要硬编码护栏规则——政策会变，护栏需要无需重新部署即可更新",
      "不要用过于激进的规则阻止合法查询——高误报率会侵蚀用户信任",
      "不要因假设 UI 已经做了清洗就跳过输入验证——API 可以被直接调用"
    ],
    "case_study_company": "NVIDIA",
    "case_study": "NVIDIA developed NeMo Guardrails to secure enterprise LLM deployments across its customer base. Using Colang, a purpose-built DSL, enterprises define conversational rails that prevent topic drift, data leakage, and prompt injection. Early adopters reported a 90% reduction in off-topic model responses and were able to deploy customer-facing chatbots that previously failed compliance review.",
    "case_study_zh": "NVIDIA 开发 NeMo Guardrails 为企业客户的大模型部署提供安全保障。企业使用专门构建的 DSL Colang 定义对话围栏，防止话题偏离、数据泄露和提示词注入。早期采用者报告跑题响应减少 90%，并成功部署了此前未通过合规审查的面客聊天机器人。",
    "when_not_to_use": [
      "Internal research or experimentation environments where safety constraints hinder exploration",
      "Creative writing applications where content policy guardrails may stifle legitimate artistic expression",
      "Performance-critical streaming use cases where any added latency is unacceptable",
      "When the LLM is operating in a fully sandboxed environment with no external-facing output"
    ],
    "when_not_to_use_zh": [
      "安全约束阻碍探索的内部研究或实验环境",
      "内容策略护栏可能抑制合法艺术表达的创意写作应用",
      "任何额外延迟都不可接受的性能关键型流式用例",
      "大模型在完全沙箱环境中运行且无外部可见输出时"
    ],
    "adopters": [
      "NVIDIA (NeMo Guardrails)",
      "Guardrails AI",
      "Microsoft (Azure AI Content Safety)",
      "Anthropic",
      "OpenAI (Moderation API)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Shreya Rajpal (2023). \"Guardrails AI: Adding Guardrails to Large Language Models\". github.com/guardrails-ai.",
    "secondary_sources": [
      "NVIDIA (2023). \"NeMo Guardrails: A Toolkit for Controllable and Safe LLM Applications with Programmable Rails\". arXiv:2310.10501.",
      "Yuntao Bai et al. (2022). \"Constitutional AI: Harmlessness from AI Feedback\". arXiv:2212.08073. Anthropic."
    ],
    "typed_relations": [
      {
        "slug": "responsible-ai-design",
        "type": "complement"
      },
      {
        "slug": "ai-output-verification",
        "type": "complement"
      },
      {
        "slug": "human-in-the-loop",
        "type": "complement"
      }
    ]
  },
  {
    "id": 94,
    "name": "Context Window Management Pattern",
    "name_zh": "上下文窗口管理模式",
    "slug": "context-window-management",
    "category": "ai",
    "desc": "Strategically manage LLM context to maximize coherence",
    "desc_zh": "战略性管理大模型上下文以最大化对话连贯性",
    "steps": [
      "Profile token usage across system prompt, history, retrieved context, and output budget",
      "Implement a sliding-window or summary-compression strategy for long conversation history",
      "Assign priority tiers to context segments (system > recent turns > retrieved > background)",
      "Prune low-priority segments first when approaching the token limit",
      "Benchmark coherence and task performance across window sizes to calibrate the strategy"
    ],
    "steps_zh": [
      "分析系统提示词、历史记录、检索上下文和输出预算各自的 Token 占用",
      "对长对话历史实现滑动窗口或摘要压缩策略",
      "为上下文片段分配优先级层级（系统 > 近期轮次 > 检索内容 > 背景知识）",
      "接近 Token 上限时优先裁剪低优先级片段",
      "在不同窗口大小下基准测试连贯性和任务性能以校准策略"
    ],
    "ai_relevant": true,
    "viz_type": "pyramid",
    "viz_labels": [
      "System",
      "History",
      "Retrieved",
      "User Input"
    ],
    "viz_labels_zh": [
      "系统提示",
      "历史对话",
      "检索内容",
      "用户输入"
    ],
    "related": [
      "rag-architecture",
      "prompt-chaining",
      "llm-system-design-patterns"
    ],
    "tags": [
      "context-window",
      "token-management",
      "compression",
      "memory"
    ],
    "origin_author": "Community-evolved pattern; formalized by LangChain (ConversationBufferWindowMemory, 2022) and Anthropic (long context research, 2023)",
    "origin_source": "Lost in the Middle: How Language Models Use Long Contexts (Nelson Liu et al., Stanford, 2023)",
    "origin_source_zh": "迷失在中间：语言模型如何使用长上下文（Nelson Liu 等，斯坦福，2023）",
    "complexity": "intermediate",
    "when_to_use": [
      "Multi-turn conversational agents where history accumulates beyond the context window limit",
      "RAG systems that must balance retrieved context against conversation history and system prompt",
      "Complex agentic workflows where tool outputs and reasoning traces fill the context rapidly",
      "Cost-sensitive applications where reducing token usage directly reduces API spend"
    ],
    "when_to_use_zh": [
      "历史记录累积超出上下文窗口限制的多轮对话代理",
      "需在检索上下文、对话历史和系统提示词间取得平衡的 RAG 系统",
      "工具输出和推理轨迹快速填满上下文的复杂代理工作流",
      "减少 Token 使用直接降低 API 费用的成本敏感应用"
    ],
    "core_concepts": [
      "Token Budget: The allocation of context window capacity across system prompt, history, retrieval, and output reserve",
      "Sliding Window: Keeping only the N most recent conversation turns and discarding older ones",
      "Summary Compression: Using an LLM to condense older conversation history into a shorter summary",
      "Priority Tiering: Ranking context segments by importance to decide eviction order when space runs out",
      "Lost-in-the-Middle Effect: The empirical finding that models attend less to information placed in the middle of long contexts"
    ],
    "core_concepts_zh": [
      "Token 预算（Token Budget）：在系统提示词、历史记录、检索内容和输出预留间分配上下文窗口容量",
      "滑动窗口（Sliding Window）：仅保留最近 N 轮对话，丢弃更早的内容",
      "摘要压缩（Summary Compression）：使用 LLM 将较早的对话历史浓缩为更短的摘要",
      "优先级分层（Priority Tiering）：按重要性排列上下文片段以决定空间不足时的淘汰顺序",
      "中间迷失效应（Lost-in-the-Middle）：模型对长上下文中间位置信息关注度较低的实证发现"
    ],
    "timeline": [
      [
        "2022-10",
        "LangChain releases ConversationBufferWindowMemory and ConversationSummaryMemory abstractions"
      ],
      [
        "2023-03",
        "GPT-4 launches with 8K/32K context windows, making context management a mainstream concern"
      ],
      [
        "2023-07",
        "Nelson Liu et al. publish Lost in the Middle, revealing positional bias in long contexts"
      ],
      [
        "2024-02",
        "Google Gemini 1.5 Pro introduces 1M token context, shifting the tradeoff calculus"
      ],
      [
        "2024-11",
        "Anthropic Claude supports 200K tokens natively; context management shifts from hard limits to cost optimization"
      ]
    ],
    "timeline_zh": [
      [
        "2022-10",
        "LangChain 发布 ConversationBufferWindowMemory 和 ConversationSummaryMemory 抽象"
      ],
      [
        "2023-03",
        "GPT-4 发布 8K/32K 上下文窗口，上下文管理成为主流关注点"
      ],
      [
        "2023-07",
        "Nelson Liu 等人发表《迷失在中间》，揭示长上下文中的位置偏差"
      ],
      [
        "2024-02",
        "Google Gemini 1.5 Pro 引入 100 万 Token 上下文，改变取舍计算逻辑"
      ],
      [
        "2024-11",
        "Anthropic Claude 原生支持 20 万 Token；上下文管理从硬限制转向成本优化"
      ]
    ],
    "dos": [
      "Always reserve output token budget -- filling 100% of context leaves no room for the response",
      "Place the most important information at the beginning and end of the context (avoid the lost-in-the-middle zone)",
      "Measure actual token usage in production to understand real allocation patterns before optimizing",
      "Use structured metadata tags to mark context segments so they can be selectively pruned"
    ],
    "dos_zh": [
      "始终预留输出 Token 预算——填满 100% 上下文后响应没有空间",
      "将最重要信息放在上下文开头和末尾（避开中间迷失区域）",
      "在优化前先测量生产环境中的实际 Token 使用量以了解真实分配模式",
      "使用结构化元数据标签标记上下文片段以支持选择性裁剪"
    ],
    "donts": [
      "Don't assume larger context windows eliminate the need for management -- cost and attention degradation still apply",
      "Don't summarize aggressively without preserving key facts -- lossy compression can cause hallucination",
      "Don't treat all conversation turns as equal priority -- recent turns and system prompts matter most",
      "Don't ignore the cost implications -- 100K context calls are 10x more expensive than 10K calls"
    ],
    "donts_zh": [
      "不要假设更大的上下文窗口就不需要管理——成本和注意力退化仍然存在",
      "不要在不保留关键事实的情况下激进摘要——有损压缩可能导致幻觉",
      "不要将所有对话轮次视为等优先级——近期轮次和系统提示词最重要",
      "不要忽视成本影响——100K 上下文调用比 10K 调用贵 10 倍"
    ],
    "case_study_company": "ChatGPT (OpenAI)",
    "case_study": "OpenAI's ChatGPT implements a sophisticated context management strategy combining sliding window history with automatic summarization. When conversations exceed the context limit, earlier turns are compressed into summaries while the system prompt and recent messages are preserved at full fidelity. This approach enabled ChatGPT to maintain coherent multi-hour conversations while keeping inference costs predictable.",
    "case_study_zh": "OpenAI 的 ChatGPT 实现了结合滑动窗口历史和自动摘要的精密上下文管理策略。当对话超出上下文限制时，较早的轮次被压缩为摘要，同时系统提示词和近期消息以全保真度保留。该方法使 ChatGPT 能在保持推理成本可预测的同时维持数小时的连贯对话。",
    "when_not_to_use": [
      "Short, single-turn interactions that never approach context limits",
      "Use cases where the entire relevant context easily fits within the window with room to spare",
      "Offline batch processing where context can be partitioned rather than compressed",
      "When using models with extremely large native windows (1M+ tokens) and cost is not a concern"
    ],
    "when_not_to_use_zh": [
      "永远不会接近上下文限制的短单轮交互",
      "全部相关上下文轻松放入窗口且有富余空间的用例",
      "上下文可以分区而非压缩的离线批处理",
      "使用超大原生窗口模型（100 万+ Token）且成本不是问题时"
    ],
    "adopters": [
      "OpenAI (ChatGPT)",
      "Anthropic (Claude)",
      "LangChain",
      "Google (Gemini)",
      "Cohere"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "performance",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Nelson Liu et al. (2023). \"Lost in the Middle: How Language Models Use Long Contexts\". arXiv:2307.03172.",
    "secondary_sources": [
      "Anthropic (2023). \"Long Context Prompting for Claude\". docs.anthropic.com.",
      "Iz Beltagy, Matthew E. Peters, and Arman Cohan (2020). \"Longformer: The Long-Document Transformer\". arXiv:2004.05150."
    ],
    "typed_relations": [
      {
        "slug": "rag-architecture",
        "type": "complement"
      },
      {
        "slug": "prompt-chaining",
        "type": "complement"
      },
      {
        "slug": "llm-system-design-patterns",
        "type": "related"
      }
    ]
  },
  {
    "id": 95,
    "name": "Tool-Use Design Pattern",
    "name_zh": "工具使用设计模式",
    "slug": "tool-use-design-pattern",
    "category": "ai",
    "desc": "Design agent-callable tools with reliable interfaces",
    "desc_zh": "为 AI 代理设计可靠可调用的工具接口",
    "steps": [
      "Define each tool as a typed function with a machine-readable JSON Schema description",
      "Write tool descriptions from the model's perspective - emphasize when to use and not use",
      "Implement idempotent tool handlers with deterministic error messages the model can act on",
      "Add a tool-selection evaluation harness to measure call accuracy on representative queries",
      "Version and deprecate tools explicitly, updating descriptions to guide migration"
    ],
    "steps_zh": [
      "将每个工具定义为带有机器可读 JSON Schema 描述的类型化函数",
      "从模型视角撰写工具描述——强调何时使用以及何时不应使用",
      "实现幂等的工具处理器，返回模型可据此行动的确定性错误信息",
      "构建工具选择评估框架，在代表性查询上度量调用准确率",
      "显式对工具进行版本管理和废弃标注，更新描述以引导迁移"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Agent",
      "Tool Router",
      "Tool A",
      "Tool B"
    ],
    "viz_labels_zh": [
      "Agent",
      "工具路由",
      "工具A",
      "工具B"
    ],
    "related": [
      "react-framework",
      "ai-first-api-design",
      "tool-use-react-pattern"
    ],
    "tags": [
      "tool-use",
      "function-calling",
      "api-design",
      "agent-tools"
    ],
    "origin_author": "Timo Schick et al. (Meta AI), 2023; OpenAI Function Calling team, 2023",
    "origin_source": "Toolformer: Language Models Can Teach Themselves to Use Tools (NeurIPS 2023) and OpenAI Function Calling API",
    "origin_source_zh": "Toolformer：语言模型可以自学使用工具（NeurIPS 2023）及 OpenAI 函数调用 API",
    "complexity": "intermediate",
    "when_to_use": [
      "Building agents that interact with external systems (databases, APIs, file systems, browsers)",
      "Tasks requiring capabilities the LLM doesn't have natively (math computation, code execution, web search)",
      "Production systems where tool invocations must be reliable, logged, and auditable",
      "Platforms exposing a growing catalog of capabilities to AI agents (plugin systems, MCP servers)"
    ],
    "when_to_use_zh": [
      "构建与外部系统（数据库、API、文件系统、浏览器）交互的代理",
      "需要大模型原生不具备能力的任务（数学计算、代码执行、网页搜索）",
      "工具调用必须可靠、可记录和可审计的生产系统",
      "向 AI 代理暴露不断增长能力目录的平台（插件系统、MCP 服务器）"
    ],
    "core_concepts": [
      "Function Schema: A JSON Schema definition describing a tool's name, parameters, types, and purpose for the model to parse",
      "Tool Description: The natural-language explanation of when and why to use a tool, written from the model's perspective",
      "Idempotency: Designing tool handlers so repeated calls with the same parameters produce the same result without side effects",
      "Parallel Tool Calling: The ability for models to emit multiple tool calls in a single turn to reduce round-trips",
      "Model Context Protocol (MCP): An emerging standard for exposing tools to AI agents via a unified server interface"
    ],
    "core_concepts_zh": [
      "函数模式（Function Schema）：描述工具名称、参数、类型和用途的 JSON Schema 定义，供模型解析",
      "工具描述（Tool Description）：从模型视角撰写的关于何时及为何使用工具的自然语言说明",
      "幂等性（Idempotency）：设计工具处理器使相同参数的重复调用产生相同结果且无副作用",
      "并行工具调用（Parallel Tool Calling）：模型在单轮中发出多个工具调用以减少往返次数的能力",
      "模型上下文协议（MCP）：通过统一服务器接口向 AI 代理暴露工具的新兴标准"
    ],
    "timeline": [
      [
        "2023-02",
        "Timo Schick et al. publish Toolformer, showing LLMs can self-learn tool usage"
      ],
      [
        "2023-06",
        "OpenAI launches Function Calling API, making structured tool use mainstream"
      ],
      [
        "2023-11",
        "Anthropic introduces tool use (function calling) for Claude models"
      ],
      [
        "2024-11",
        "Anthropic publishes the Model Context Protocol (MCP), an open standard for tool servers"
      ],
      [
        "2024-12",
        "MCP gains wide adoption; major IDEs and agent frameworks integrate MCP tool servers"
      ]
    ],
    "timeline_zh": [
      [
        "2023-02",
        "Timo Schick 等人发表 Toolformer，展示大模型可自学工具使用"
      ],
      [
        "2023-06",
        "OpenAI 发布函数调用 API，使结构化工具使用成为主流"
      ],
      [
        "2023-11",
        "Anthropic 为 Claude 模型引入工具使用（函数调用）功能"
      ],
      [
        "2024-11",
        "Anthropic 发布模型上下文协议（MCP），工具服务器的开放标准"
      ],
      [
        "2024-12",
        "MCP 获得广泛采用；主要 IDE 和代理框架集成 MCP 工具服务器"
      ]
    ],
    "dos": [
      "Write tool descriptions that include both when-to-use and when-NOT-to-use guidance for the model",
      "Return structured error messages that tell the model what went wrong and how to fix the call",
      "Keep the tool catalog focused -- fewer, well-described tools outperform many vaguely described ones",
      "Test tool selection accuracy separately from tool execution -- they are distinct failure modes"
    ],
    "dos_zh": [
      "工具描述中同时包含何时使用和何时不应使用的指导",
      "返回结构化错误信息，告诉模型出了什么问题以及如何修正调用",
      "保持工具目录精简——少量描述清晰的工具优于大量描述模糊的工具",
      "将工具选择准确率与工具执行分开测试——它们是不同的故障模式"
    ],
    "donts": [
      "Don't use ambiguous parameter names like 'data' or 'input' -- use semantically clear names like 'search_query'",
      "Don't expose destructive operations (delete, overwrite) without confirmation mechanisms",
      "Don't return large unstructured blobs from tools -- the model wastes context parsing irrelevant output",
      "Don't assume the model will always call tools correctly -- validate parameters server-side"
    ],
    "donts_zh": [
      "不要使用模糊参数名如 data 或 input——使用语义清晰的名称如 search_query",
      "不要在没有确认机制的情况下暴露破坏性操作（删除、覆写）",
      "不要从工具返回大型非结构化数据——模型会浪费上下文解析无关输出",
      "不要假设模型总能正确调用工具——在服务端验证参数"
    ],
    "case_study_company": "OpenAI",
    "case_study": "OpenAI's Function Calling API transformed how developers build tool-using agents. By standardizing the JSON Schema format for tool definitions, OpenAI enabled a plugin ecosystem where thousands of third-party tools could be exposed to GPT models. The ChatGPT plugin store (later replaced by GPTs) demonstrated that well-designed tool schemas could enable models to orchestrate complex workflows spanning search, computation, and external APIs.",
    "case_study_zh": "OpenAI 的函数调用 API 变革了开发者构建工具使用代理的方式。通过标准化工具定义的 JSON Schema 格式，OpenAI 构建了一个插件生态系统，使数千个第三方工具能被 GPT 模型调用。ChatGPT 插件商店（后被 GPTs 取代）证明了精心设计的工具模式能使模型编排涵盖搜索、计算和外部 API 的复杂工作流。",
    "when_not_to_use": [
      "Tasks the model can complete entirely from its training knowledge without external data",
      "When tool invocation adds latency that degrades the user experience for simple queries",
      "Environments where external API calls are blocked by network security policies",
      "Prototypes where hardcoded responses are sufficient and tool infrastructure is overhead"
    ],
    "when_not_to_use_zh": [
      "模型完全可以从训练知识中完成、不需要外部数据的任务",
      "工具调用增加的延迟会降低简单查询用户体验时",
      "网络安全策略阻止外部 API 调用的环境",
      "硬编码响应即可满足需求、工具基础设施是额外负担的原型阶段"
    ],
    "adopters": [
      "OpenAI",
      "Anthropic (Claude / MCP)",
      "Google (Gemini Function Calling)",
      "LangChain",
      "Vercel AI SDK"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "usability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Timo Schick et al. (2023). \"Toolformer: Language Models Can Teach Themselves to Use Tools\". NeurIPS 2023.",
    "secondary_sources": [
      "OpenAI (2023). \"Function Calling and Other API Updates\". openai.com/blog.",
      "Yujia Qin et al. (2023). \"Tool Learning with Foundation Models\". arXiv:2304.08354."
    ],
    "typed_relations": [
      {
        "slug": "react-framework",
        "type": "complement"
      },
      {
        "slug": "ai-first-api-design",
        "type": "complement"
      },
      {
        "slug": "tool-use-react-pattern",
        "type": "extends"
      }
    ]
  },
  {
    "id": 96,
    "name": "AI-First API Design",
    "name_zh": "AI 优先 API 设计",
    "slug": "ai-first-api-design",
    "category": "ai",
    "desc": "Design APIs optimized for consumption by AI agents",
    "desc_zh": "面向 AI 代理消费优化设计 API 接口",
    "steps": [
      "Expose semantic resource names and self-describing endpoints (OpenAPI + rich descriptions)",
      "Return structured, schema-validated JSON responses that avoid ambiguous free-text fields",
      "Provide bulk and idempotent operations to reduce agent round-trips and retries",
      "Embed task-relevant metadata (next actions, related resources) in responses as hints",
      "Publish a machine-readable capability manifest so agents can discover and compose APIs"
    ],
    "steps_zh": [
      "暴露语义化资源名称和自描述端点（OpenAPI + 丰富描述）",
      "返回模式验证的结构化 JSON，避免歧义自由文本字段",
      "提供批量操作和幂等操作以减少代理往返请求和重试",
      "在响应中嵌入任务相关元数据（后续动作、关联资源）作为提示",
      "发布机器可读的能力清单，使代理能够发现和组合 API"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Schema",
      "Context",
      "Actions",
      "Auth"
    ],
    "viz_labels_zh": [
      "模式定义",
      "上下文",
      "可执行动作",
      "鉴权"
    ],
    "related": [
      "tool-use-design-pattern",
      "richardson-maturity-model",
      "agent-communication-protocol"
    ],
    "tags": [
      "api-design",
      "ai-first",
      "machine-readable",
      "agent-friendly"
    ],
    "origin_author": "Community-evolved pattern; influenced by Anthropic MCP (2024), OpenAI Plugins (2023), and Stripe API design philosophy",
    "origin_source": "Model Context Protocol specification (Anthropic, 2024) and API design guidelines from OpenAI, Stripe, and Twilio",
    "origin_source_zh": "模型上下文协议规范（Anthropic，2024）及 OpenAI、Stripe、Twilio 的 API 设计指南",
    "complexity": "advanced",
    "when_to_use": [
      "Building APIs that will be consumed by LLM agents via function calling or MCP",
      "Creating internal platform APIs where AI assistants will be primary consumers alongside humans",
      "Designing microservice interfaces in systems where AI orchestrators route between services",
      "Migrating existing APIs to be compatible with the AI agent ecosystem"
    ],
    "when_to_use_zh": [
      "构建将被大模型代理通过函数调用或 MCP 消费的 API",
      "创建 AI 助手将与人类一起作为主要消费者的内部平台 API",
      "在 AI 编排器在服务间路由的系统中设计微服务接口",
      "将现有 API 迁移为与 AI 代理生态系统兼容"
    ],
    "core_concepts": [
      "Self-Describing Endpoint: An API endpoint with rich OpenAPI descriptions that an agent can understand without external documentation",
      "Capability Manifest: A machine-readable document (like MCP server config) that advertises what an API can do",
      "Semantic Resource Naming: Using clear, descriptive resource names (/invoices, /customers) instead of opaque IDs or abbreviations",
      "Affordance Embedding: Including next-possible-actions or HATEOAS-style links in API responses to guide agent navigation",
      "Error Actionability: Returning structured error responses that tell the agent exactly what to fix (missing field, invalid format, etc.)"
    ],
    "core_concepts_zh": [
      "自描述端点（Self-Describing Endpoint）：具有丰富 OpenAPI 描述、代理无需外部文档即可理解的 API 端点",
      "能力清单（Capability Manifest）：广播 API 功能的机器可读文档（如 MCP 服务器配置）",
      "语义资源命名（Semantic Resource Naming）：使用清晰描述性的资源名称（/invoices、/customers）而非不透明 ID 或缩写",
      "功能可见性嵌入（Affordance Embedding）：在 API 响应中包含下一步可能操作或 HATEOAS 风格链接以引导代理导航",
      "可操作错误（Error Actionability）：返回结构化错误响应，准确告知代理需要修正什么（缺失字段、无效格式等）"
    ],
    "timeline": [
      [
        "2023-03",
        "OpenAI launches ChatGPT Plugins, requiring APIs to expose OpenAPI specs for agent consumption"
      ],
      [
        "2023-06",
        "OpenAI Function Calling formalizes JSON Schema as the interface between models and APIs"
      ],
      [
        "2024-04",
        "Anthropic publishes Model Context Protocol (MCP), an open standard for AI-consumable tool servers"
      ],
      [
        "2024-09",
        "Major SaaS platforms (Stripe, Shopify, Salesforce) begin publishing MCP-compatible API endpoints"
      ],
      [
        "2025-01",
        "AI-first API design becomes a recognized discipline as agent-to-agent API traffic grows"
      ]
    ],
    "timeline_zh": [
      [
        "2023-03",
        "OpenAI 发布 ChatGPT 插件，要求 API 暴露 OpenAPI 规范供代理消费"
      ],
      [
        "2023-06",
        "OpenAI 函数调用将 JSON Schema 确立为模型与 API 之间的接口"
      ],
      [
        "2024-04",
        "Anthropic 发布模型上下文协议（MCP），AI 可消费工具服务器的开放标准"
      ],
      [
        "2024-09",
        "主要 SaaS 平台（Stripe、Shopify、Salesforce）开始发布 MCP 兼容的 API 端点"
      ],
      [
        "2025-01",
        "随着代理间 API 流量增长，AI 优先 API 设计成为一门公认的学科"
      ]
    ],
    "dos": [
      "Write API descriptions as if the consumer has zero prior context -- agents don't read README files",
      "Return consistent, typed response schemas so agents can reliably parse without error handling hacks",
      "Include pagination metadata and total counts in list responses for agents to plan their retrieval",
      "Provide sandbox/test environments where agents can safely explore API capabilities"
    ],
    "dos_zh": [
      "撰写 API 描述时假设消费者零上下文——代理不会阅读 README 文件",
      "返回一致的类型化响应模式，使代理能可靠解析而无需错误处理补丁",
      "在列表响应中包含分页元数据和总数，供代理规划检索策略",
      "提供沙箱/测试环境供代理安全探索 API 能力"
    ],
    "donts": [
      "Don't return HTML or unstructured text in API responses -- agents need structured JSON to act on",
      "Don't require multi-step authentication flows that agents can't navigate programmatically",
      "Don't use inconsistent naming conventions across endpoints -- agents rely on naming patterns for discovery",
      "Don't embed critical information only in documentation -- put it in the OpenAPI spec where agents can see it"
    ],
    "donts_zh": [
      "不要在 API 响应中返回 HTML 或非结构化文本——代理需要结构化 JSON 来操作",
      "不要要求代理无法程序化导航的多步认证流程",
      "不要在端点间使用不一致的命名约定——代理依赖命名模式进行发现",
      "不要将关键信息仅放在文档中——放入 OpenAPI 规范让代理可以看到"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe's API has been widely cited as the gold standard for AI-consumable API design. Its consistent resource naming, rich error messages with fix suggestions, and comprehensive OpenAPI specification made it one of the first APIs that LLM agents could navigate effectively. When ChatGPT Plugins launched, Stripe was among the first integrations, and agents could create payment links, manage subscriptions, and issue refunds with minimal prompt engineering.",
    "case_study_zh": "Stripe 的 API 被广泛认为是 AI 可消费 API 设计的黄金标准。其一致的资源命名、包含修复建议的丰富错误信息和全面的 OpenAPI 规范，使其成为最早能被大模型代理有效导航的 API 之一。当 ChatGPT 插件推出时，Stripe 是首批集成之一，代理能以极少的提示词工程创建支付链接、管理订阅和处理退款。",
    "when_not_to_use": [
      "Internal microservices that will only ever be consumed by other services, not AI agents",
      "High-performance binary protocols (gRPC, Protocol Buffers) where human/AI readability is traded for speed",
      "Legacy systems with no feasible path to adding rich descriptions or structured schemas",
      "APIs that handle only human-to-human workflows with no AI agent interaction"
    ],
    "when_not_to_use_zh": [
      "仅会被其他服务而非 AI 代理消费的内部微服务",
      "以可读性换取速度的高性能二进制协议（gRPC、Protocol Buffers）",
      "无法添加丰富描述或结构化模式的遗留系统",
      "仅处理人与人工作流、没有 AI 代理交互的 API"
    ],
    "adopters": [
      "Stripe",
      "Shopify",
      "Anthropic (MCP)",
      "Twilio",
      "Salesforce"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Anthropic (2024). \"Model Context Protocol Specification\". modelcontextprotocol.io.",
    "secondary_sources": [
      "OpenAI (2023). \"ChatGPT Plugins\". openai.com/blog.",
      "JJ Geewax (2021). \"API Design Patterns\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "tool-use-design-pattern",
        "type": "complement"
      },
      {
        "slug": "richardson-maturity-model",
        "type": "extends"
      },
      {
        "slug": "agent-communication-protocol",
        "type": "complement"
      }
    ]
  },
  {
    "id": 97,
    "name": "Self-Healing Systems Pattern",
    "name_zh": "自愈系统模式",
    "slug": "self-healing-systems",
    "category": "ai",
    "desc": "Use AI agents to detect, diagnose, and remediate failures",
    "desc_zh": "利用 AI 代理自动检测、诊断并修复系统故障",
    "steps": [
      "Instrument the system with rich structured logs, metrics, and health-check endpoints",
      "Deploy a monitoring agent that correlates signals and classifies failure modes",
      "Maintain a runbook knowledge base the agent uses to select remediation actions",
      "Execute low-risk remediations automatically; escalate high-risk ones for human approval",
      "Record every incident-action-outcome triple to continuously improve the runbook"
    ],
    "steps_zh": [
      "为系统配备丰富的结构化日志、指标和健康检查端点",
      "部署监控代理，关联信号并分类故障模式",
      "维护运维手册知识库供代理选择修复动作",
      "自动执行低风险修复操作；将高风险操作升级至人工审批",
      "记录每次故障-动作-结果三元组，持续完善运维手册"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Monitor",
      "Detect",
      "Diagnose",
      "Remediate"
    ],
    "viz_labels_zh": [
      "监控",
      "检测",
      "诊断",
      "自动修复"
    ],
    "related": [
      "chaos-engineering",
      "circuit-breaker-pattern",
      "agent-reliability-patterns"
    ],
    "tags": [
      "self-healing",
      "auto-remediation",
      "monitoring",
      "runbook"
    ],
    "origin_author": "IBM (Autonomic Computing Manifesto), 2001; modern AI-ops formalized by PagerDuty and Shoreline.io, 2022",
    "origin_source": "The Vision of Autonomic Computing (Jeffrey Kephart & David Chess, IBM, 2003) and AIOps: Real-World Challenges and Research Innovations",
    "origin_source_zh": "自主计算愿景（Jeffrey Kephart & David Chess，IBM，2003）及 AIOps：真实世界挑战与研究创新",
    "complexity": "advanced",
    "when_to_use": [
      "Large-scale distributed systems where manual incident response cannot keep pace with failure frequency",
      "Cloud-native environments with well-instrumented observability stacks (metrics, logs, traces)",
      "On-call rotation optimization -- automating known, low-risk fixes to reduce alert fatigue",
      "Infrastructure with predictable failure patterns (disk full, OOM, certificate expiry) suited to automated runbooks"
    ],
    "when_to_use_zh": [
      "手动事故响应无法跟上故障频率的大规模分布式系统",
      "具备完善可观测性技术栈（指标、日志、追踪）的云原生环境",
      "值班轮岗优化——自动化已知低风险修复以减少告警疲劳",
      "具有可预测故障模式（磁盘满、OOM、证书过期）、适合自动化运维手册的基础设施"
    ],
    "core_concepts": [
      "Autonomic Loop: The self-healing cycle of Monitor, Analyze, Plan, Execute (MAPE-K) from IBM's autonomic computing vision",
      "Runbook Automation: Codified operational procedures that an AI agent can execute to resolve known failure classes",
      "Signal Correlation: Aggregating metrics, logs, and traces to identify root causes rather than reacting to individual alerts",
      "Blast Radius Assessment: Evaluating the potential impact of an automated remediation action before executing it",
      "Incident Triple: A (failure, action, outcome) record used to evaluate remediation effectiveness and improve the system"
    ],
    "core_concepts_zh": [
      "自主循环（Autonomic Loop）：源自 IBM 自主计算愿景的自愈循环——监控、分析、规划、执行（MAPE-K）",
      "运维手册自动化（Runbook Automation）：AI 代理可执行的编码化操作流程，用于解决已知故障类型",
      "信号关联（Signal Correlation）：聚合指标、日志和追踪以识别根因，而非对单个告警做出反应",
      "爆炸半径评估（Blast Radius Assessment）：在执行自动化修复操作前评估其潜在影响范围",
      "事故三元组（Incident Triple）：（故障、操作、结果）记录，用于评估修复效果并改进系统"
    ],
    "timeline": [
      [
        "2001",
        "IBM publishes the Autonomic Computing Manifesto, envisioning self-managing systems"
      ],
      [
        "2003",
        "Kephart & Chess formalize the MAPE-K loop in The Vision of Autonomic Computing"
      ],
      [
        "2019",
        "Moogsoft and BigPanda popularize AIOps for intelligent alert correlation and noise reduction"
      ],
      [
        "2022",
        "Shoreline.io launches AI-powered automated remediation with LLM-driven runbook execution"
      ],
      [
        "2024",
        "PagerDuty, Datadog, and New Relic integrate LLM agents for automated incident diagnosis and remediation"
      ]
    ],
    "timeline_zh": [
      [
        "2001",
        "IBM 发布自主计算宣言，构想自管理系统"
      ],
      [
        "2003",
        "Kephart 与 Chess 在《自主计算愿景》中正式定义 MAPE-K 循环"
      ],
      [
        "2019",
        "Moogsoft 和 BigPanda 推广 AIOps 智能告警关联和噪声降低"
      ],
      [
        "2022",
        "Shoreline.io 推出 AI 驱动的自动化修复，支持 LLM 驱动的运维手册执行"
      ],
      [
        "2024",
        "PagerDuty、Datadog 和 New Relic 集成 LLM 代理用于自动化事故诊断和修复"
      ]
    ],
    "dos": [
      "Start with well-understood, low-risk remediations (restart service, scale out, clear cache) before automating complex fixes",
      "Require human approval for destructive or high-blast-radius actions even in automated pipelines",
      "Record all automated actions with full context for post-incident review and auditing",
      "Continuously update runbooks based on incident-action-outcome data to improve future remediation"
    ],
    "dos_zh": [
      "从已知低风险修复（重启服务、扩容、清缓存）开始，再逐步自动化复杂修复",
      "即使在自动化管线中，对破坏性或大爆炸半径操作仍要求人工审批",
      "记录所有自动化操作及完整上下文，用于事后复盘和审计",
      "基于事故-操作-结果数据持续更新运维手册以改进未来修复"
    ],
    "donts": [
      "Don't automate remediations you haven't manually validated -- an untested runbook can make incidents worse",
      "Don't suppress alerts when automation kicks in -- operators need visibility into what's being auto-remediated",
      "Don't deploy self-healing without a kill switch -- you must be able to disable automation instantly",
      "Don't conflate symptom treatment with root cause resolution -- restarting a crashing service is a bandaid, not a fix"
    ],
    "donts_zh": [
      "不要自动化未经手动验证的修复——未测试的运维手册可能加剧事故",
      "不要在自动化启动时抑制告警——运维人员需要了解正在被自动修复的内容",
      "不要在没有紧急停止开关的情况下部署自愈系统——必须能立即禁用自动化",
      "不要将症状治疗与根因解决混为一谈——重启崩溃的服务是创可贴而非修复"
    ],
    "case_study_company": "Shoreline.io",
    "case_study": "Shoreline.io built an AI-driven auto-remediation platform that integrates with PagerDuty and Datadog to automatically diagnose and fix infrastructure issues. When a disk-full alert fires, Shoreline's agent identifies the offending log files, archives them, and clears space -- all within 30 seconds. Customers reported an 80% reduction in mean time to remediation (MTTR) for their top 20 most frequent incident types.",
    "case_study_zh": "Shoreline.io 构建了与 PagerDuty 和 Datadog 集成的 AI 驱动自动修复平台，自动诊断和修复基础设施问题。当磁盘满告警触发时，Shoreline 的代理识别问题日志文件、归档并清理空间——全部在 30 秒内完成。客户报告其 Top 20 最频繁事故类型的平均修复时间（MTTR）降低了 80%。",
    "when_not_to_use": [
      "Novel, never-before-seen failure modes where no runbook exists and human judgment is essential",
      "Small-scale systems where manual remediation is fast enough and the automation overhead isn't justified",
      "Environments with poor observability -- self-healing requires high-quality signals to diagnose correctly",
      "Compliance environments where every remediation action must be pre-approved by a human"
    ],
    "when_not_to_use_zh": [
      "没有运维手册、需要人类判断的全新未见故障模式",
      "手动修复已足够快、自动化开销不值得的小规模系统",
      "可观测性差的环境——自愈需要高质量信号才能正确诊断",
      "每项修复操作都必须由人工预先批准的合规环境"
    ],
    "adopters": [
      "Shoreline.io",
      "PagerDuty",
      "Datadog",
      "Netflix (auto-remediation in Spinnaker)",
      "Amazon (AWS Systems Manager)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "observability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Jeffrey Kephart and David Chess (2003). \"The Vision of Autonomic Computing\". IEEE Computer, 36(1).",
    "secondary_sources": [
      "IBM (2001). \"Autonomic Computing: IBM's Perspective on the State of Information Technology\". IBM Research.",
      "Pankaj Jalote (2013). \"Autonomic Computing Concepts\". Proceedings of the IEEE."
    ],
    "typed_relations": [
      {
        "slug": "chaos-engineering",
        "type": "complement"
      },
      {
        "slug": "circuit-breaker-pattern",
        "type": "extends"
      },
      {
        "slug": "agent-reliability-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 98,
    "name": "AI Observability Framework",
    "name_zh": "AI 系统可观测性框架",
    "slug": "ai-observability-framework",
    "category": "ai",
    "desc": "Trace, monitor, and explain LLM system behavior in prod",
    "desc_zh": "在生产环境中追踪、监控并解释大模型系统行为",
    "steps": [
      "Instrument every LLM call with trace IDs linking prompt, model version, and raw output",
      "Collect latency, token cost, and quality metrics per call and aggregate into dashboards",
      "Implement an LLM-as-judge evaluator that scores output quality on sampled traffic",
      "Set up drift alerts when quality or distribution metrics deviate from baseline",
      "Store prompt-response pairs in an evaluation dataset for offline regression testing"
    ],
    "steps_zh": [
      "为每次大模型调用打上关联提示词、模型版本和原始输出的追踪 ID",
      "收集每次调用的延迟、Token 成本和质量指标并汇总至仪表盘",
      "实现 LLM-as-Judge 评估器，对采样流量的输出质量打分",
      "当质量或分布指标偏离基线时触发漂移告警",
      "将提示词-响应对存入评估数据集，用于离线回归测试"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Traces",
      "Metrics",
      "Evals",
      "Costs",
      "Safety"
    ],
    "viz_labels_zh": [
      "链路追踪",
      "指标",
      "评估",
      "成本",
      "安全"
    ],
    "related": [
      "llmops",
      "llm-evaluation-framework",
      "four-golden-signals"
    ],
    "tags": [
      "observability",
      "tracing",
      "monitoring",
      "llm-metrics",
      "drift"
    ],
    "origin_author": "Arize AI (founding team), 2020; LangSmith (LangChain), 2023",
    "origin_source": "ML Observability concepts (Arize AI, 2021) and LangSmith: LLM Application Tracing and Evaluation Platform (LangChain, 2023)",
    "origin_source_zh": "ML 可观测性概念（Arize AI，2021）及 LangSmith：大模型应用追踪与评估平台（LangChain，2023）",
    "complexity": "intermediate",
    "when_to_use": [
      "Production LLM applications where you need to monitor quality, cost, and latency continuously",
      "Multi-step agent pipelines where you must trace which step caused a quality degradation",
      "A/B testing new prompts or models where before/after quality comparison requires metrics",
      "Regulated environments requiring audit trails of all AI-generated outputs"
    ],
    "when_to_use_zh": [
      "需要持续监控质量、成本和延迟的生产大模型应用",
      "需要追踪哪个步骤导致质量退化的多步代理管线",
      "需要前后质量对比指标的新提示词或模型 A/B 测试",
      "需要所有 AI 生成输出审计追踪的受监管环境"
    ],
    "core_concepts": [
      "LLM Trace: A structured record of a single LLM call including prompt, completion, model ID, latency, and token counts",
      "Span: A unit of work within a multi-step agent trace (e.g., retrieval span, LLM span, tool span)",
      "LLM-as-Judge: Using a separate LLM to evaluate the quality of another model's outputs on dimensions like relevance and faithfulness",
      "Quality Drift: A gradual degradation in output quality detected via statistical monitoring of evaluation scores over time",
      "Evaluation Dataset: A curated set of prompt-expected_output pairs used for regression testing after prompt or model changes"
    ],
    "core_concepts_zh": [
      "LLM 追踪（LLM Trace）：单次大模型调用的结构化记录，包含提示词、补全、模型 ID、延迟和 Token 计数",
      "Span（跨度）：多步代理追踪中的一个工作单元（如检索 span、LLM span、工具 span）",
      "LLM-as-Judge（大模型评委）：使用独立大模型在相关性和忠实度等维度评估另一个模型的输出质量",
      "质量漂移（Quality Drift）：通过统计监控评估分数随时间推移检测到的输出质量逐渐退化",
      "评估数据集（Evaluation Dataset）：用于提示词或模型变更后回归测试的精选提示词-预期输出对集合"
    ],
    "timeline": [
      [
        "2021",
        "Arize AI pioneers ML observability with embedding drift detection and model monitoring"
      ],
      [
        "2023-03",
        "Weights & Biases launches Prompts, extending experiment tracking to LLM applications"
      ],
      [
        "2023-07",
        "LangSmith launches as a dedicated LLM tracing and evaluation platform"
      ],
      [
        "2023-12",
        "OpenTelemetry community begins standardizing semantic conventions for GenAI/LLM spans"
      ],
      [
        "2024-06",
        "Braintrust, Helicone, and Langfuse emerge as specialized LLM observability platforms"
      ]
    ],
    "timeline_zh": [
      [
        "2021",
        "Arize AI 以嵌入漂移检测和模型监控开创 ML 可观测性领域"
      ],
      [
        "2023-03",
        "Weights & Biases 推出 Prompts 功能，将实验追踪扩展到大模型应用"
      ],
      [
        "2023-07",
        "LangSmith 作为专用大模型追踪和评估平台上线"
      ],
      [
        "2023-12",
        "OpenTelemetry 社区开始标准化 GenAI/LLM span 的语义约定"
      ],
      [
        "2024-06",
        "Braintrust、Helicone 和 Langfuse 作为专业 LLM 可观测性平台涌现"
      ]
    ],
    "dos": [
      "Log full prompt-response pairs, not just metrics -- you'll need the actual text to debug quality issues",
      "Implement LLM-as-judge evaluation on sampled production traffic for continuous quality monitoring",
      "Set up cost dashboards broken down by feature, user segment, and model to optimize spend",
      "Create golden evaluation datasets and run regression tests before deploying prompt changes"
    ],
    "dos_zh": [
      "记录完整提示词-响应对而不只是指标——调试质量问题时需要实际文本",
      "在采样生产流量上实现 LLM-as-Judge 评估以进行持续质量监控",
      "按功能、用户群和模型细分设置成本仪表盘以优化支出",
      "创建黄金评估数据集，在部署提示词变更前运行回归测试"
    ],
    "donts": [
      "Don't log only errors -- quality drift is often subtle and only visible through continuous evaluation",
      "Don't ignore token cost tracking -- LLM costs can spike unpredictably when usage patterns change",
      "Don't evaluate solely with automated metrics -- periodically review samples manually for nuanced issues",
      "Don't store PII in observability logs without proper anonymization and access controls"
    ],
    "donts_zh": [
      "不要仅记录错误——质量漂移通常很微妙，只有通过持续评估才能发现",
      "不要忽视 Token 成本追踪——使用模式变化时 LLM 成本可能不可预测地飙升",
      "不要仅依赖自动化指标评估——定期手动审查样本以发现细微问题",
      "不要在未进行适当脱敏和访问控制的情况下在可观测性日志中存储 PII"
    ],
    "case_study_company": "LangChain (LangSmith)",
    "case_study": "LangChain built LangSmith as a dedicated observability platform for LLM applications after recognizing that traditional APM tools couldn't capture prompt-level traces. LangSmith provides end-to-end tracing of multi-step chains, automatic evaluation scoring, and dataset management. Within a year of launch, over 100,000 developers used LangSmith to debug and optimize their LLM applications, making it the de facto tracing standard for the LangChain ecosystem.",
    "case_study_zh": "LangChain 在认识到传统 APM 工具无法捕获提示词级追踪后，构建了 LangSmith 作为大模型应用专用可观测性平台。LangSmith 提供多步链路的端到端追踪、自动评估打分和数据集管理。上线一年内超过 10 万名开发者使用 LangSmith 调试和优化大模型应用，使其成为 LangChain 生态系统的事实追踪标准。",
    "when_not_to_use": [
      "One-off scripts or experiments where formal observability infrastructure is overkill",
      "Applications with no production users where monitoring provides no actionable signal",
      "Prototypes where the cost of observability tooling exceeds the value of the insights",
      "Systems with strict data sovereignty requirements that prohibit sending traces to third-party platforms"
    ],
    "when_not_to_use_zh": [
      "正式可观测性基础设施过度的一次性脚本或实验",
      "没有生产用户、监控无法提供可操作信号的应用",
      "可观测性工具成本超过洞察价值的原型",
      "严格数据主权要求禁止向第三方平台发送追踪数据的系统"
    ],
    "adopters": [
      "LangChain (LangSmith)",
      "Arize AI (Phoenix)",
      "Helicone",
      "Braintrust",
      "Weights & Biases"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Arize AI (2021). \"ML Observability: Monitoring, Troubleshooting, and Explaining Machine Learning Models\". arize.com.",
    "secondary_sources": [
      "Harrison Chase (2023). \"LangSmith: LLM Application Tracing and Evaluation\". langchain.com.",
      "Shreya Shankar et al. (2022). \"Operationalizing Machine Learning: An Interview Study\". arXiv:2209.09125."
    ],
    "typed_relations": [
      {
        "slug": "llmops",
        "type": "complement"
      },
      {
        "slug": "llm-evaluation-framework",
        "type": "complement"
      },
      {
        "slug": "four-golden-signals",
        "type": "extends"
      }
    ]
  },
  {
    "id": 99,
    "name": "Responsible AI Design Framework",
    "name_zh": "负责任 AI 设计框架",
    "slug": "responsible-ai-design",
    "category": "ai",
    "desc": "Embed fairness, safety, and accountability in AI systems",
    "desc_zh": "在 AI 系统中内嵌公平性、安全性与问责机制",
    "steps": [
      "Conduct an AI impact assessment covering bias, privacy, autonomy, and misuse vectors",
      "Define measurable fairness metrics and test across demographic and use-case slices",
      "Implement transparency features: model cards, decision explanations, confidence scores",
      "Establish an accountability chain mapping each AI decision to a responsible human owner",
      "Run red-team adversarial exercises and publish a responsible-use policy for the system"
    ],
    "steps_zh": [
      "开展 AI 影响评估，覆盖偏见、隐私、自主性和滥用向量",
      "定义可量化的公平性指标，在人群和用例切片上进行测试",
      "实现透明度特性：模型卡、决策解释、置信度分数",
      "建立问责链，将每项 AI 决策映射到负责任的人类负责人",
      "开展红队对抗演练，并发布系统的负责任使用政策"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Fairness",
      "Transparency",
      "Privacy",
      "Safety",
      "Accountability"
    ],
    "viz_labels_zh": [
      "公平性",
      "透明度",
      "隐私",
      "安全",
      "问责"
    ],
    "related": [
      "guardrails-framework",
      "human-in-the-loop",
      "human-ai-interaction-design"
    ],
    "tags": [
      "responsible-ai",
      "fairness",
      "safety",
      "accountability",
      "ethics"
    ],
    "origin_author": "Microsoft (Responsible AI Standard, 2022); Google (AI Principles, 2018); EU AI Act (2024)",
    "origin_source": "Microsoft Responsible AI Standard v2 (2022), Google AI Principles (2018), and EU Artificial Intelligence Act (2024)",
    "origin_source_zh": "微软负责任 AI 标准 v2（2022）、Google AI 原则（2018）及欧盟人工智能法案（2024）",
    "complexity": "advanced",
    "when_to_use": [
      "AI systems making decisions that affect people's lives (hiring, lending, healthcare, criminal justice)",
      "Products deployed in regulated markets subject to the EU AI Act or similar legislation",
      "Consumer-facing AI features where bias or unfair treatment could cause reputational and legal harm",
      "Any AI system operating at scale where undetected bias compounds into systemic discrimination"
    ],
    "when_to_use_zh": [
      "影响人们生活的 AI 决策系统（招聘、贷款、医疗、刑事司法）",
      "在受欧盟 AI 法案或类似法规监管的市场中部署的产品",
      "偏见或不公平对待可能造成声誉和法律损害的面消费者 AI 功能",
      "任何大规模运行、未检测偏见会累积为系统性歧视的 AI 系统"
    ],
    "core_concepts": [
      "Model Card: A standardized document describing a model's intended use, performance metrics, limitations, and ethical considerations",
      "Fairness Metric: A quantitative measure (e.g., demographic parity, equalized odds) that evaluates whether a model treats different groups equitably",
      "Red Teaming: Adversarial testing by dedicated teams that try to elicit harmful, biased, or policy-violating outputs from the model",
      "AI Impact Assessment: A structured review of potential harms covering bias, privacy, safety, and misuse before deployment",
      "Constitutional AI: Anthropic's approach where AI systems are trained to follow a set of principles (a constitution) for safe and helpful behavior"
    ],
    "core_concepts_zh": [
      "模型卡（Model Card）：描述模型预期用途、性能指标、局限性和伦理考量的标准化文档",
      "公平性指标（Fairness Metric）：评估模型是否公平对待不同群体的量化度量（如人口统计均等、均等机会）",
      "红队测试（Red Teaming）：专门团队进行的对抗性测试，尝试引导模型产出有害、偏见或违反政策的输出",
      "AI 影响评估（AI Impact Assessment）：部署前对潜在危害（偏见、隐私、安全、滥用）的结构化审查",
      "Constitutional AI：Anthropic 的方法，训练 AI 系统遵循一组原则（宪法）以实现安全且有帮助的行为"
    ],
    "timeline": [
      [
        "2018-06",
        "Google publishes its AI Principles after employee protests over Project Maven"
      ],
      [
        "2019-01",
        "Margaret Mitchell & Timnit Gebru publish Model Cards for Model Reporting at FAT* 2019"
      ],
      [
        "2022-06",
        "Microsoft releases Responsible AI Standard v2, operationalizing fairness requirements"
      ],
      [
        "2023-10",
        "US Executive Order on AI Safety mandates red-teaming and impact assessments for frontier models"
      ],
      [
        "2024-03",
        "EU AI Act formally adopted, creating the world's first comprehensive AI legislation"
      ]
    ],
    "timeline_zh": [
      [
        "2018-06",
        "Google 在员工抗议 Project Maven 后发布 AI 原则"
      ],
      [
        "2019-01",
        "Margaret Mitchell 和 Timnit Gebru 在 FAT* 2019 发表模型报告的模型卡"
      ],
      [
        "2022-06",
        "微软发布负责任 AI 标准 v2，将公平性要求操作化"
      ],
      [
        "2023-10",
        "美国 AI 安全行政令要求对前沿模型进行红队测试和影响评估"
      ],
      [
        "2024-03",
        "欧盟 AI 法案正式通过，创建全球首部综合性 AI 立法"
      ]
    ],
    "dos": [
      "Conduct fairness audits across demographic slices before deployment, not after incidents occur",
      "Publish model cards and system documentation that honestly describe limitations and known failure modes",
      "Involve diverse stakeholders (legal, ethics, affected communities) in AI impact assessments",
      "Build mechanisms for affected individuals to contest AI decisions and receive human review"
    ],
    "dos_zh": [
      "在部署前而非事故发生后对人口统计切片进行公平性审计",
      "发布诚实描述局限性和已知故障模式的模型卡和系统文档",
      "让多元利益相关者（法律、伦理、受影响社区）参与 AI 影响评估",
      "建立受影响个人可以对 AI 决策提出异议并获得人工审查的机制"
    ],
    "donts": [
      "Don't treat responsible AI as a checkbox exercise -- embed it into the development lifecycle, not just the launch review",
      "Don't assume fairness on aggregate metrics alone -- disaggregate and test across all relevant subgroups",
      "Don't publish AI principles without enforcement mechanisms -- principles without accountability are performative",
      "Don't delay responsible AI practices until scale -- biases baked in early are hardest to fix later"
    ],
    "donts_zh": [
      "不要将负责任 AI 视为勾选项——将其嵌入开发生命周期而非仅在发布审查时",
      "不要仅基于聚合指标假设公平——需分解并测试所有相关子群体",
      "不要在没有执行机制的情况下发布 AI 原则——没有问责的原则是表演性的",
      "不要将负责任 AI 实践推迟到规模化——早期植入的偏见最难在后期修复"
    ],
    "case_study_company": "Microsoft",
    "case_study": "Microsoft's Responsible AI Standard v2 requires every AI product team to complete an impact assessment, fairness evaluation, and transparency documentation before launch. After the Tay chatbot incident in 2016, Microsoft established an Office of Responsible AI and an AI Ethics & Effects in Engineering and Research (Aether) committee. This framework now governs AI features across Azure, Microsoft 365, and Bing, and has been publicly shared to influence industry practices.",
    "case_study_zh": "微软负责任 AI 标准 v2 要求每个 AI 产品团队在发布前完成影响评估、公平性评估和透明度文档。在 2016 年 Tay 聊天机器人事件后，微软成立了负责任 AI 办公室和 AI 伦理与影响工程研究（Aether）委员会。该框架现在治理 Azure、Microsoft 365 和 Bing 中的 AI 功能，并已公开分享以影响行业实践。",
    "when_not_to_use": [
      "Internal research experiments with no deployment path or user-facing impact",
      "Purely creative AI applications with no decision-making authority over people",
      "When the overhead of formal assessments would prevent timely response to urgent needs (apply a lighter process instead)",
      "Systems operating in domains with no disparate impact risk and no regulated data"
    ],
    "when_not_to_use_zh": [
      "没有部署路径或用户影响的内部研究实验",
      "对人不具有决策权的纯创意 AI 应用",
      "正式评估的开销会阻碍紧急需求及时响应时（改用轻量级流程）",
      "在没有差异化影响风险和受监管数据的领域中运行的系统"
    ],
    "adopters": [
      "Microsoft",
      "Google",
      "Anthropic",
      "Meta AI",
      "IBM (AI Fairness 360)"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Microsoft (2022). \"Microsoft Responsible AI Standard, v2\". microsoft.com.",
    "secondary_sources": [
      "Google (2018). \"AI Principles\". ai.google/principles.",
      "European Union (2024). \"EU Artificial Intelligence Act\". eur-lex.europa.eu."
    ],
    "typed_relations": [
      {
        "slug": "guardrails-framework",
        "type": "complement"
      },
      {
        "slug": "human-in-the-loop",
        "type": "complement"
      },
      {
        "slug": "human-ai-interaction-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 100,
    "name": "Agent Communication Protocol",
    "name_zh": "代理通信协议模式",
    "slug": "agent-communication-protocol",
    "category": "ai",
    "desc": "Standardize message contracts between autonomous AI agents",
    "desc_zh": "标准化自主 AI 代理之间的消息契约",
    "steps": [
      "Define a canonical message envelope: sender, receiver, intent, payload, correlation ID",
      "Adopt or implement a shared registry where agents advertise their capabilities and schemas",
      "Use async message queues for decoupled agent communication to avoid tight coupling",
      "Implement acknowledgment and retry semantics with idempotency keys for reliability",
      "Version all message schemas and enforce backward-compatibility rules as agents evolve"
    ],
    "steps_zh": [
      "定义规范消息信封：发送方、接收方、意图、载荷、关联 ID",
      "采用或实现共享注册中心，供代理广播自身能力和消息模式",
      "使用异步消息队列进行松耦合的代理通信，避免强依赖",
      "实现带有幂等键的确认与重试语义以保障可靠性",
      "对所有消息模式进行版本管理，并在代理演进时强制向后兼容"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Message",
      "Protocol",
      "Agent A",
      "Agent B"
    ],
    "viz_labels_zh": [
      "消息",
      "协议",
      "Agent A",
      "Agent B"
    ],
    "related": [
      "multi-agent-orchestration-pattern",
      "actor-model",
      "eda"
    ],
    "tags": [
      "agent-protocol",
      "messaging",
      "interoperability",
      "standards"
    ],
    "origin_author": "FIPA (Foundation for Intelligent Physical Agents), 1996; modern: Agent Protocol (e2b), 2023; Anthropic MCP, 2024",
    "origin_source": "FIPA Agent Communication Language Specifications (IEEE, 2002) and Agent Protocol open standard (e2b.dev, 2023)",
    "origin_source_zh": "FIPA 代理通信语言规范（IEEE，2002）及 Agent Protocol 开放标准（e2b.dev，2023）",
    "complexity": "advanced",
    "when_to_use": [
      "Multi-agent systems where agents from different frameworks or vendors must interoperate",
      "Enterprise deployments where agent-to-agent communication needs auditability and reliability guarantees",
      "Marketplaces or ecosystems where third-party agents connect to a shared platform",
      "Systems requiring asynchronous, durable communication between long-running agents"
    ],
    "when_to_use_zh": [
      "来自不同框架或供应商的代理必须互操作的多代理系统",
      "代理间通信需要可审计性和可靠性保证的企业部署",
      "第三方代理连接到共享平台的市场或生态系统",
      "需要长运行代理间异步持久通信的系统"
    ],
    "core_concepts": [
      "Message Envelope: A standardized wrapper containing sender, receiver, intent, payload, timestamp, and correlation ID",
      "Capability Registry: A shared directory where agents publish their available skills and the message schemas they accept",
      "Intent: A typed action descriptor (e.g., request, inform, propose, confirm) that classifies the purpose of a message",
      "Idempotency Key: A unique identifier ensuring that retried messages don't cause duplicate side effects",
      "Schema Versioning: A backward-compatible evolution strategy for message formats as agents are updated independently"
    ],
    "core_concepts_zh": [
      "消息信封（Message Envelope）：包含发送方、接收方、意图、载荷、时间戳和关联 ID 的标准化包装",
      "能力注册中心（Capability Registry）：代理发布可用技能和接受的消息模式的共享目录",
      "意图（Intent）：分类消息目的的类型化动作描述符（如请求、通知、提议、确认）",
      "幂等键（Idempotency Key）：确保重试消息不会导致重复副作用的唯一标识符",
      "模式版本管理（Schema Versioning）：代理独立更新时消息格式的向后兼容演进策略"
    ],
    "timeline": [
      [
        "1996",
        "FIPA founded to standardize agent communication; publishes the ACL (Agent Communication Language)"
      ],
      [
        "2002",
        "FIPA-ACL adopted as IEEE standard for multi-agent system interoperability"
      ],
      [
        "2023-08",
        "e2b.dev launches Agent Protocol, an open REST-based standard for AI agent communication"
      ],
      [
        "2024-11",
        "Anthropic publishes Model Context Protocol (MCP), establishing tool-server communication standards"
      ],
      [
        "2025-04",
        "Google A2A (Agent-to-Agent) protocol announced; industry converges on agent interoperability standards"
      ]
    ],
    "timeline_zh": [
      [
        "1996",
        "FIPA 成立以标准化代理通信；发布 ACL（代理通信语言）"
      ],
      [
        "2002",
        "FIPA-ACL 作为多代理系统互操作性的 IEEE 标准被采纳"
      ],
      [
        "2023-08",
        "e2b.dev 发布 Agent Protocol——基于 REST 的 AI 代理通信开放标准"
      ],
      [
        "2024-11",
        "Anthropic 发布模型上下文协议（MCP），建立工具服务器通信标准"
      ],
      [
        "2025-04",
        "Google 宣布 A2A（Agent-to-Agent）协议；行业在代理互操作性标准上趋于融合"
      ]
    ],
    "dos": [
      "Use correlation IDs to trace request-response chains across multi-agent conversations",
      "Design messages as immutable events -- append-only logs make debugging and replay easy",
      "Implement schema validation at both sender and receiver to catch contract violations early",
      "Plan for backward compatibility from day one -- agents will be updated at different times"
    ],
    "dos_zh": [
      "使用关联 ID 追踪多代理对话中的请求-响应链",
      "将消息设计为不可变事件——仅追加日志使调试和重放变得简单",
      "在发送方和接收方都实现模式验证以尽早捕获契约违规",
      "从第一天就规划向后兼容——代理会在不同时间更新"
    ],
    "donts": [
      "Don't rely on natural language for inter-agent messaging -- use structured schemas to prevent misinterpretation",
      "Don't assume all agents are online simultaneously -- design for async communication with message persistence",
      "Don't create custom protocols when an established standard (MCP, Agent Protocol, A2A) fits your use case",
      "Don't skip authentication between agents -- unauthorized agent communication is a security risk"
    ],
    "donts_zh": [
      "不要依赖自然语言进行代理间消息传递——使用结构化模式防止误解",
      "不要假设所有代理同时在线——设计支持消息持久化的异步通信",
      "不要在已有标准（MCP、Agent Protocol、A2A）适用时创建自定义协议",
      "不要跳过代理间的身份认证——未授权的代理通信是安全风险"
    ],
    "case_study_company": "Anthropic",
    "case_study": "Anthropic's Model Context Protocol (MCP) standardized how AI agents communicate with tool servers, creating a universal interface that replaced fragmented custom integrations. Within months of release, MCP was adopted by Cursor, Windsurf, Replit, and dozens of other developer tools. The protocol's simple JSON-RPC-based design and capability negotiation mechanism enabled any tool to be exposed to any MCP-compatible agent without custom glue code.",
    "case_study_zh": "Anthropic 的模型上下文协议（MCP）标准化了 AI 代理与工具服务器的通信方式，创建了取代碎片化自定义集成的通用接口。发布数月内，MCP 被 Cursor、Windsurf、Replit 及数十个其他开发者工具采用。该协议基于 JSON-RPC 的简洁设计和能力协商机制使任何工具都能无需自定义粘合代码即可暴露给任何 MCP 兼容代理。",
    "when_not_to_use": [
      "Single-agent systems with no need for inter-agent communication",
      "Tightly coupled agent pairs where direct function calls are simpler than message passing",
      "Rapid prototypes where protocol overhead slows down experimentation",
      "Homogeneous agent deployments where all agents share the same framework and internal communication suffices"
    ],
    "when_not_to_use_zh": [
      "无需代理间通信的单代理系统",
      "直接函数调用比消息传递更简单的紧耦合代理对",
      "协议开销减慢实验速度的快速原型",
      "所有代理共享同一框架、内部通信即可满足需求的同质代理部署"
    ],
    "adopters": [
      "Anthropic (MCP)",
      "Google (A2A Protocol)",
      "e2b.dev (Agent Protocol)",
      "Microsoft (AutoGen)",
      "Fetch.ai"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "FIPA (2002). \"FIPA Agent Communication Language Specifications\". IEEE Foundation for Intelligent Physical Agents.",
    "secondary_sources": [
      "E2B (2023). \"Agent Protocol: Open Standard for AI Agent Communication\". agentprotocol.ai.",
      "Anthropic (2024). \"Model Context Protocol Specification\". modelcontextprotocol.io."
    ],
    "typed_relations": [
      {
        "slug": "multi-agent-orchestration-pattern",
        "type": "complement"
      },
      {
        "slug": "actor-model",
        "type": "complement"
      },
      {
        "slug": "eda",
        "type": "complement"
      }
    ]
  },
  {
    "id": 191,
    "name": "Agentic Workflow Patterns",
    "name_zh": "智能体工作流模式",
    "slug": "agentic-workflow-patterns",
    "category": "ai",
    "desc": "Plan-execute-reflect loops for autonomous agents",
    "desc_zh": "用于自主智能体的规划-执行-反思循环",
    "steps": [
      "Decompose the user goal into a structured plan with discrete, verifiable sub-tasks",
      "Execute each sub-task using appropriate tools, APIs, or code generation capabilities",
      "After each execution step, reflect on the outcome: verify correctness, check for errors, and assess progress",
      "Revise the plan dynamically based on reflection outputs — add, reorder, or drop sub-tasks as needed",
      "Terminate when all success criteria are met or escalate to a human when confidence is below threshold"
    ],
    "steps_zh": [
      "将用户目标分解为包含离散可验证子任务的结构化计划",
      "使用合适的工具、API 或代码生成能力执行每个子任务",
      "每次执行后进行反思：验证正确性、检查错误并评估进展",
      "基于反思输出动态修订计划——根据需要添加、重排或删除子任务",
      "当所有成功标准满足时终止，或在置信度低于阈值时交由人类处理"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Plan",
      "Execute",
      "Reflect",
      "Adapt"
    ],
    "viz_labels_zh": [
      "规划",
      "执行",
      "反思",
      "适应"
    ],
    "related": [
      "react-framework",
      "multi-agent-orchestration-pattern",
      "prompt-chaining",
      "tool-use-design-pattern"
    ],
    "tags": [
      "agentic",
      "workflow",
      "plan-execute-reflect",
      "autonomous-agents",
      "llm-agents"
    ],
    "origin_author": "Andrew Ng (2024 keynotes); Anthropic, OpenAI, and LangChain agent frameworks",
    "origin_source": "Building Effective Agents (Anthropic, 2024); Andrew Ng's agentic workflow talks (2024)",
    "origin_source_zh": "《构建高效智能体》（Anthropic，2024）；Andrew Ng 的智能体工作流演讲（2024）",
    "complexity": "advanced",
    "when_to_use": [
      "Complex multi-step tasks that require dynamic planning and tool orchestration beyond a single prompt-response",
      "Coding agents that need to write, test, debug, and iterate on code autonomously",
      "Research and analysis workflows where the agent must search, synthesize, and verify information across sources",
      "Customer support automation where the agent must navigate multiple systems and decision trees"
    ],
    "when_to_use_zh": [
      "需要动态规划和工具编排、超越单次提示-响应的复杂多步任务",
      "需要自主编写、测试、调试和迭代代码的编码智能体",
      "智能体需要跨来源搜索、综合和验证信息的研究与分析工作流",
      "智能体需要导航多个系统和决策树的客户支持自动化"
    ],
    "core_concepts": [
      "Plan Phase: The agent decomposes a high-level goal into an ordered sequence of actionable sub-tasks before executing anything",
      "Execute Phase: Each sub-task is carried out using tools, code execution, or API calls, producing observable intermediate results",
      "Reflect Phase: The agent evaluates execution outcomes against expectations, detecting errors and identifying necessary plan revisions",
      "Dynamic Replanning: Unlike static pipelines, agentic workflows can modify their plan mid-execution based on what they learn"
    ],
    "core_concepts_zh": [
      "规划阶段：智能体在执行前将高级目标分解为有序的可操作子任务序列",
      "执行阶段：每个子任务通过工具、代码执行或 API 调用完成，产生可观察的中间结果",
      "反思阶段：智能体根据预期评估执行结果，检测错误并识别必要的计划修订",
      "动态重规划：与静态流水线不同，智能体工作流可以根据学习到的信息在执行中修改计划"
    ],
    "timeline": [
      [
        "2023-03",
        "LangChain introduces Plan-and-Execute agents, separating planning from execution in agent architectures"
      ],
      [
        "2023-11",
        "OpenAI launches Assistants API with built-in multi-step tool use and thread management"
      ],
      [
        "2024-01",
        "Andrew Ng popularizes 'agentic workflows' in his influential keynotes, identifying four key patterns"
      ],
      [
        "2024-12",
        "Anthropic publishes Building Effective Agents, formalizing plan-execute-reflect as an industry pattern"
      ],
      [
        "2025",
        "Agentic coding tools (Claude Code, Cursor, Devin) demonstrate production-grade plan-execute-reflect in software engineering"
      ]
    ],
    "timeline_zh": [
      [
        "2023-03",
        "LangChain 引入 Plan-and-Execute 智能体，在架构中分离规划与执行"
      ],
      [
        "2023-11",
        "OpenAI 推出 Assistants API，内置多步工具使用和线程管理"
      ],
      [
        "2024-01",
        "Andrew Ng 在影响力巨大的演讲中推广「智能体工作流」，识别四种关键模式"
      ],
      [
        "2024-12",
        "Anthropic 发布《构建高效智能体》，将规划-执行-反思正式化为行业模式"
      ],
      [
        "2025",
        "智能体编码工具（Claude Code、Cursor、Devin）在软件工程中展示生产级的规划-执行-反思"
      ]
    ],
    "dos": [
      "Build in explicit reflection steps that verify each sub-task's output before proceeding to the next",
      "Set hard limits on total iterations and token budget to prevent runaway agents that loop indefinitely",
      "Design clear success criteria and termination conditions so the agent knows when it has completed the task",
      "Log full plan-execute-reflect traces for debugging, evaluation, and continuous improvement of agent behavior"
    ],
    "dos_zh": [
      "构建显式反思步骤，在进入下一步之前验证每个子任务的输出",
      "设置总迭代次数和 Token 预算的硬限制，防止智能体无限循环失控",
      "设计明确的成功标准和终止条件，让智能体知道何时完成任务",
      "记录完整的规划-执行-反思轨迹，用于调试、评估和持续改进智能体行为"
    ],
    "donts": [
      "Don't let agents execute irreversible actions (delete, send, deploy) without human confirmation gates",
      "Don't skip the reflect phase to save tokens — unverified execution compounds errors across subsequent steps",
      "Don't use agentic workflows for simple tasks that a single prompt can handle — the overhead is not justified",
      "Don't hardcode plans — the value of agentic workflows lies in dynamic adaptation, not rigid step sequences"
    ],
    "donts_zh": [
      "不要让智能体在没有人类确认门控的情况下执行不可逆操作（删除、发送、部署）",
      "不要为了节省 Token 跳过反思阶段——未验证的执行会在后续步骤中累积错误",
      "不要对单个提示即可处理的简单任务使用智能体工作流——开销不合理",
      "不要硬编码计划——智能体工作流的价值在于动态适应而非固定步骤序列"
    ],
    "case_study_company": "Anthropic",
    "case_study": "Anthropic's Claude Code implements the plan-execute-reflect pattern for autonomous software engineering tasks. When given a complex coding task, Claude Code decomposes it into sub-tasks (understand codebase, plan changes, implement, test, iterate), executes each step with tool use (file read/write, shell commands, search), and reflects on outcomes (checking test results, verifying correctness). This pattern enables Claude Code to handle multi-file refactors, bug fixes, and feature implementations that would require dozens of manual steps.",
    "case_study_zh": "Anthropic 的 Claude Code 为自主软件工程任务实现了规划-执行-反思模式。当面临复杂编码任务时，Claude Code 将其分解为子任务（理解代码库、规划变更、实现、测试、迭代），使用工具（文件读写、Shell 命令、搜索）执行每步，并反思结果（检查测试结果、验证正确性）。这种模式使 Claude Code 能够处理需要数十个手动步骤的多文件重构、Bug 修复和功能实现。",
    "when_not_to_use": [
      "Simple single-turn tasks (classification, summarization, translation) that don't benefit from multi-step planning",
      "Latency-critical applications where the overhead of plan-reflect loops is unacceptable",
      "Tasks with no verifiable intermediate outputs — reflection requires observable signals to be useful",
      "Environments where autonomous tool execution poses unacceptable security or compliance risks"
    ],
    "when_not_to_use_zh": [
      "不需要多步规划的简单单轮任务（分类、摘要、翻译）",
      "规划-反思循环开销不可接受的延迟敏感应用",
      "没有可验证中间输出的任务——反思需要可观察信号才有用",
      "自主工具执行带来不可接受安全或合规风险的环境"
    ],
    "adopters": [
      "Anthropic (Claude Code)",
      "OpenAI (Assistants API)",
      "Cognition (Devin)",
      "LangChain",
      "Microsoft (AutoGen)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "experimental",
    "primary_source": "Anthropic (2024). \"Building Effective Agents\". anthropic.com.",
    "secondary_sources": [
      "Andrew Ng (2024). \"Agentic Design Patterns\". deeplearning.ai.",
      "Shunyu Yao et al. (2023). \"ReAct: Synergizing Reasoning and Acting in Language Models\". ICLR 2023."
    ],
    "typed_relations": [
      {
        "slug": "react-framework",
        "type": "complement"
      },
      {
        "slug": "multi-agent-orchestration-pattern",
        "type": "complement"
      },
      {
        "slug": "prompt-chaining",
        "type": "complement"
      },
      {
        "slug": "tool-use-design-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 192,
    "name": "Model Context Protocol (MCP)",
    "name_zh": "模型上下文协议（MCP）",
    "slug": "model-context-protocol-mcp",
    "category": "ai",
    "desc": "Standardized tool integration for LLMs (Anthropic, 2024)",
    "desc_zh": "大模型标准化工具集成协议（Anthropic，2024）",
    "steps": [
      "Define MCP servers that expose tools, resources, and prompts through a standardized JSON-RPC interface",
      "Configure the MCP client (LLM host application) to discover and connect to available MCP servers",
      "The LLM selects and invokes tools via the MCP protocol, passing structured arguments and receiving typed responses",
      "Implement authentication, rate limiting, and access control at the MCP server level for production safety",
      "Compose multiple MCP servers to give the LLM access to a rich, extensible ecosystem of tools and data sources"
    ],
    "steps_zh": [
      "定义通过标准化 JSON-RPC 接口暴露工具、资源和提示的 MCP 服务器",
      "配置 MCP 客户端（大模型宿主应用）以发现和连接可用的 MCP 服务器",
      "大模型通过 MCP 协议选择和调用工具，传递结构化参数并接收类型化响应",
      "在 MCP 服务器层面实现认证、速率限制和访问控制以确保生产安全",
      "组合多个 MCP 服务器，为大模型提供丰富可扩展的工具和数据源生态系统"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Client",
      "MCP Server",
      "Resource",
      "Tool"
    ],
    "viz_labels_zh": [
      "客户端",
      "MCP服务器",
      "资源",
      "工具"
    ],
    "related": [
      "tool-use-design-pattern",
      "react-framework",
      "agentic-workflow-patterns",
      "agent-communication-protocol"
    ],
    "tags": [
      "mcp",
      "tool-integration",
      "protocol",
      "llm-tools",
      "anthropic"
    ],
    "origin_author": "Anthropic, 2024",
    "origin_source": "Model Context Protocol Specification (modelcontextprotocol.io, Anthropic 2024)",
    "origin_source_zh": "模型上下文协议规范（modelcontextprotocol.io，Anthropic 2024）",
    "complexity": "intermediate",
    "when_to_use": [
      "Building LLM applications that need to integrate with external tools, databases, and APIs in a standardized way",
      "Organizations wanting to create reusable tool packages that work across different LLM providers and frameworks",
      "Agent systems that require dynamic tool discovery and composition at runtime",
      "Enterprise environments needing a governance layer (auth, audit, rate limiting) between LLMs and external systems"
    ],
    "when_to_use_zh": [
      "构建需要以标准化方式集成外部工具、数据库和 API 的大模型应用",
      "希望创建可在不同大模型提供商和框架间复用的工具包的组织",
      "需要在运行时动态发现和组合工具的智能体系统",
      "需要在大模型与外部系统之间设置治理层（认证、审计、速率限制）的企业环境"
    ],
    "core_concepts": [
      "MCP Server: A lightweight service that exposes tools, resources, and prompts through a standardized JSON-RPC protocol",
      "MCP Client: The LLM host application that discovers, connects to, and invokes tools on MCP servers on behalf of the model",
      "Tools: Executable functions with typed input/output schemas that the LLM can invoke to perform actions in external systems",
      "Resources: Read-only data sources (files, database queries, API responses) that provide context to the LLM without side effects"
    ],
    "core_concepts_zh": [
      "MCP 服务器：通过标准化 JSON-RPC 协议暴露工具、资源和提示的轻量级服务",
      "MCP 客户端：代表模型发现、连接和调用 MCP 服务器上工具的大模型宿主应用",
      "工具：带有类型化输入/输出模式的可执行函数，大模型可调用以在外部系统中执行操作",
      "资源：只读数据源（文件、数据库查询、API 响应），为大模型提供无副作用的上下文"
    ],
    "timeline": [
      [
        "2024-11",
        "Anthropic announces the Model Context Protocol as an open standard for LLM-tool integration"
      ],
      [
        "2024-12",
        "MCP gains rapid adoption with servers for GitHub, Slack, PostgreSQL, filesystem, and dozens more"
      ],
      [
        "2025-01",
        "OpenAI, Google, and other LLM providers begin adopting MCP, establishing it as an emerging industry standard"
      ],
      [
        "2025-03",
        "Enterprise MCP gateways emerge, adding authentication, audit logging, and rate limiting to MCP server interactions"
      ],
      [
        "2025",
        "MCP ecosystem grows to hundreds of community-built servers, becoming the USB-C of LLM tool integration"
      ]
    ],
    "timeline_zh": [
      [
        "2024-11",
        "Anthropic 宣布模型上下文协议作为大模型工具集成的开放标准"
      ],
      [
        "2024-12",
        "MCP 快速获得采用，出现 GitHub、Slack、PostgreSQL、文件系统等数十个服务器"
      ],
      [
        "2025-01",
        "OpenAI、Google 和其他大模型提供商开始采用 MCP，使其成为新兴行业标准"
      ],
      [
        "2025-03",
        "企业级 MCP 网关涌现，为 MCP 服务器交互添加认证、审计日志和速率限制"
      ],
      [
        "2025",
        "MCP 生态系统增长到数百个社区构建的服务器，成为大模型工具集成的 USB-C"
      ]
    ],
    "dos": [
      "Design MCP servers with single-responsibility — one server per domain (GitHub, database, email) for clean composition",
      "Include comprehensive tool descriptions and parameter documentation so the LLM can select and use tools accurately",
      "Implement proper error handling and typed responses in MCP servers to help the LLM recover gracefully from failures",
      "Use MCP's resource primitive for read-only data to clearly separate safe reads from potentially dangerous tool actions"
    ],
    "dos_zh": [
      "以单一职责原则设计 MCP 服务器——每个领域（GitHub、数据库、邮件）一个服务器以实现清晰组合",
      "包含全面的工具描述和参数文档，使大模型能准确选择和使用工具",
      "在 MCP 服务器中实现适当的错误处理和类型化响应，帮助大模型从故障中优雅恢复",
      "使用 MCP 的资源原语处理只读数据，清晰分离安全读取与潜在危险的工具操作"
    ],
    "donts": [
      "Don't expose destructive operations (delete, overwrite) without requiring explicit user confirmation at the client level",
      "Don't bundle too many unrelated tools into a single MCP server — it reduces the LLM's tool selection accuracy",
      "Don't skip authentication and access control in production MCP servers — unauthenticated tool access is a security risk",
      "Don't assume the LLM will always use tools correctly — validate inputs server-side and return clear error messages"
    ],
    "donts_zh": [
      "不要在客户端层面未要求明确用户确认的情况下暴露破坏性操作（删除、覆盖）",
      "不要在单个 MCP 服务器中捆绑过多不相关的工具——会降低大模型的工具选择准确性",
      "不要在生产 MCP 服务器中跳过认证和访问控制——未认证的工具访问是安全风险",
      "不要假设大模型总会正确使用工具——在服务器端验证输入并返回清晰的错误消息"
    ],
    "case_study_company": "Anthropic",
    "case_study": "Anthropic developed MCP to solve the N-by-M integration problem between LLM applications and external tools. Before MCP, every LLM app had to build custom integrations for each tool. With MCP, Anthropic's Claude Desktop and Claude Code can connect to any MCP server, giving users instant access to GitHub, databases, file systems, and more through a single protocol. The open-source specification attracted contributions from hundreds of developers within weeks of launch, creating the fastest-growing tool integration ecosystem in AI.",
    "case_study_zh": "Anthropic 开发 MCP 以解决大模型应用与外部工具之间的 N×M 集成问题。在 MCP 之前，每个大模型应用必须为每个工具构建自定义集成。使用 MCP 后，Anthropic 的 Claude Desktop 和 Claude Code 可以连接任何 MCP 服务器，通过单一协议让用户即时访问 GitHub、数据库、文件系统等。开源规范在发布数周内就吸引了数百名开发者的贡献，创造了 AI 领域增长最快的工具集成生态系统。",
    "when_not_to_use": [
      "Simple applications with a single, well-defined tool integration that doesn't benefit from protocol abstraction",
      "Environments where all tool access is internal and a direct function call is simpler than a protocol layer",
      "Prototypes where the overhead of setting up MCP servers slows down experimentation",
      "Systems with strict network isolation requirements where running additional server processes is not permitted"
    ],
    "when_not_to_use_zh": [
      "只有单一明确定义的工具集成、不需要协议抽象的简单应用",
      "所有工具访问都是内部的、直接函数调用比协议层更简单的环境",
      "设置 MCP 服务器的开销会减慢实验速度的原型",
      "不允许运行额外服务器进程的严格网络隔离要求的系统"
    ],
    "adopters": [
      "Anthropic",
      "Cursor",
      "Sourcegraph",
      "Replit",
      "Zed"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "experimental",
    "primary_source": "Anthropic (2024). \"Model Context Protocol Specification\". modelcontextprotocol.io.",
    "secondary_sources": [
      "Anthropic (2024). \"Introducing the Model Context Protocol\". anthropic.com/news.",
      "FIPA (2002). \"FIPA Agent Communication Language Specifications\". IEEE Foundation for Intelligent Physical Agents."
    ],
    "typed_relations": [
      {
        "slug": "tool-use-design-pattern",
        "type": "extends"
      },
      {
        "slug": "react-framework",
        "type": "complement"
      },
      {
        "slug": "agentic-workflow-patterns",
        "type": "complement"
      },
      {
        "slug": "agent-communication-protocol",
        "type": "complement"
      }
    ]
  },
  {
    "id": 193,
    "name": "Retrieval-Augmented Fine-Tuning (RAFT)",
    "name_zh": "检索增强微调（RAFT）",
    "slug": "retrieval-augmented-fine-tuning-raft",
    "category": "ai",
    "desc": "Combine RAG with fine-tuning for domain adaptation",
    "desc_zh": "结合检索增强生成与微调实现领域适配",
    "steps": [
      "Prepare a domain-specific corpus and generate question-answer pairs with supporting and distractor documents",
      "Fine-tune the LLM on examples that include both relevant (oracle) and irrelevant (distractor) retrieved documents",
      "Train the model to cite chain-of-thought reasoning from the relevant documents while ignoring distractors",
      "Evaluate the fine-tuned model on held-out domain questions, comparing against pure RAG and pure fine-tuning baselines",
      "Deploy the RAFT model with a production RAG pipeline, benefiting from both parametric knowledge and retrieval grounding"
    ],
    "steps_zh": [
      "准备领域特定语料库，生成包含支持文档和干扰文档的问答对",
      "使用同时包含相关（正例）和不相关（干扰）检索文档的示例微调大模型",
      "训练模型从相关文档中引用思维链推理，同时忽略干扰文档",
      "在保留的领域问题上评估微调模型，与纯 RAG 和纯微调基线进行比较",
      "将 RAFT 模型与生产 RAG 流水线一起部署，同时受益于参数知识和检索锚定"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Dataset",
      "Distract Docs",
      "Fine-tune",
      "Evaluate"
    ],
    "viz_labels_zh": [
      "训练数据",
      "干扰文档",
      "微调",
      "评估"
    ],
    "related": [
      "rag-architecture",
      "prompt-chaining",
      "llm-evaluation-framework"
    ],
    "tags": [
      "raft",
      "rag",
      "fine-tuning",
      "domain-adaptation",
      "retrieval"
    ],
    "origin_author": "Tianjun Zhang et al. (UC Berkeley), 2024",
    "origin_source": "RAFT: Adapting Language Model to Domain Specific RAG (Tianjun Zhang et al., arXiv 2024)",
    "origin_source_zh": "RAFT：使语言模型适配领域特定检索增强生成（Tianjun Zhang 等，arXiv 2024）",
    "complexity": "advanced",
    "when_to_use": [
      "Domain-specific applications (legal, medical, financial) where both retrieval accuracy and domain knowledge matter",
      "Enterprise RAG systems where pure retrieval produces too many irrelevant results that confuse the LLM",
      "Use cases requiring high factual accuracy with citation — the model must distinguish relevant from irrelevant context",
      "Scenarios where pure fine-tuning hallucinates on recent or niche information that wasn't in training data"
    ],
    "when_to_use_zh": [
      "检索准确性和领域知识都重要的领域特定应用（法律、医疗、金融）",
      "纯检索产生过多不相关结果导致大模型混淆的企业 RAG 系统",
      "需要带引用的高事实准确性的场景——模型必须区分相关与不相关上下文",
      "纯微调对训练数据中不存在的最新或小众信息产生幻觉的场景"
    ],
    "core_concepts": [
      "Oracle Documents: The relevant retrieved documents that contain the answer — the model must learn to identify and cite these",
      "Distractor Documents: Irrelevant retrieved documents mixed into training examples to teach the model robustness against retrieval noise",
      "Chain-of-Thought Citation: The model is trained to produce reasoning traces that explicitly reference relevant passages, improving verifiability",
      "Hybrid Knowledge: RAFT combines parametric knowledge (from fine-tuning) with non-parametric knowledge (from retrieval) for superior domain performance"
    ],
    "core_concepts_zh": [
      "正例文档：包含答案的相关检索文档——模型必须学会识别和引用这些文档",
      "干扰文档：混入训练示例中的不相关检索文档，教会模型对检索噪声的鲁棒性",
      "思维链引用：模型被训练生成明确引用相关段落的推理轨迹，提高可验证性",
      "混合知识：RAFT 将参数知识（来自微调）与非参数知识（来自检索）结合，实现卓越的领域性能"
    ],
    "timeline": [
      [
        "2020",
        "RAG (Lewis et al.) establishes retrieval-augmented generation as a foundational LLM pattern"
      ],
      [
        "2023",
        "Enterprise RAG deployments reveal limitations: retrieval noise, hallucination, and poor domain adaptation"
      ],
      [
        "2024-03",
        "Tianjun Zhang et al. publish RAFT, demonstrating that fine-tuning with retrieval context outperforms both pure RAG and pure fine-tuning"
      ],
      [
        "2024-06",
        "RAFT methodology adopted by enterprise AI teams for legal, medical, and financial domain adaptation"
      ],
      [
        "2025",
        "Automated RAFT dataset generation tools emerge, reducing the manual effort of creating oracle/distractor training pairs"
      ]
    ],
    "timeline_zh": [
      [
        "2020",
        "RAG（Lewis 等）确立检索增强生成作为基础大模型模式"
      ],
      [
        "2023",
        "企业 RAG 部署暴露局限性：检索噪声、幻觉和领域适配不足"
      ],
      [
        "2024-03",
        "Tianjun Zhang 等发表 RAFT，证明使用检索上下文微调优于纯 RAG 和纯微调"
      ],
      [
        "2024-06",
        "RAFT 方法被企业 AI 团队用于法律、医疗和金融领域适配"
      ],
      [
        "2025",
        "自动化 RAFT 数据集生成工具出现，减少创建正例/干扰训练对的人工工作"
      ]
    ],
    "dos": [
      "Include a mix of oracle-only and oracle-plus-distractor examples during training to build robust retrieval discrimination",
      "Train the model to produce explicit chain-of-thought citations that trace answers back to specific retrieved passages",
      "Evaluate RAFT models against both pure RAG and pure fine-tuning baselines to quantify the hybrid benefit",
      "Use domain experts to validate training QA pairs — garbage-in training data produces garbage-out RAFT models"
    ],
    "dos_zh": [
      "在训练中混合仅正例和正例加干扰的示例，以建立鲁棒的检索区分能力",
      "训练模型生成明确的思维链引用，将答案追溯至特定检索段落",
      "将 RAFT 模型与纯 RAG 和纯微调基线进行比较评估，量化混合收益",
      "使用领域专家验证训练问答对——垃圾训练数据产出垃圾 RAFT 模型"
    ],
    "donts": [
      "Don't skip distractor documents in training — a model trained only on oracle documents fails when real retrieval returns noise",
      "Don't use RAFT as a substitute for good retrieval — poor retrieval quality undermines even a well-fine-tuned model",
      "Don't fine-tune on stale domain data — RAFT models inherit the biases and gaps of their training corpus",
      "Don't ignore the cost of fine-tuning iteration — RAFT requires more compute than pure RAG, so validate the ROI first"
    ],
    "donts_zh": [
      "不要在训练中跳过干扰文档——仅用正例文档训练的模型在真实检索返回噪声时会失败",
      "不要将 RAFT 视为良好检索的替代——糟糕的检索质量会削弱即使微调良好的模型",
      "不要使用过时的领域数据微调——RAFT 模型会继承训练语料的偏见和缺口",
      "不要忽视微调迭代的成本——RAFT 比纯 RAG 需要更多计算资源，先验证投资回报率"
    ],
    "case_study_company": "UC Berkeley",
    "case_study": "UC Berkeley researchers demonstrated RAFT on domain-specific benchmarks including PubMed (medical), HotpotQA (multi-hop reasoning), and Gorilla API documentation. The RAFT-trained Llama-7B model outperformed both standard RAG with GPT-3.5 and domain-fine-tuned models on PubMed QA by 15-20%, while maintaining the ability to cite specific passages. This proved that teaching models to read retrieval results during fine-tuning produces substantially better domain specialists than either approach alone.",
    "case_study_zh": "加州大学伯克利分校的研究人员在领域特定基准上展示了 RAFT，包括 PubMed（医学）、HotpotQA（多跳推理）和 Gorilla API 文档。经 RAFT 训练的 Llama-7B 模型在 PubMed QA 上比标准 RAG + GPT-3.5 和领域微调模型高出 15-20%，同时保持引用特定段落的能力。这证明在微调过程中教模型阅读检索结果，比任何单一方法都能产出更好的领域专家。",
    "when_not_to_use": [
      "General-purpose chatbots where domain specialization is not needed and pure RAG is sufficient",
      "Rapidly changing domains where the fine-tuning cycle cannot keep pace with data updates",
      "Teams without the compute budget or MLOps infrastructure to manage fine-tuning pipelines",
      "Use cases where the base model's zero-shot performance with RAG already meets quality requirements"
    ],
    "when_not_to_use_zh": [
      "不需要领域专业化、纯 RAG 即可满足的通用聊天机器人",
      "微调周期无法跟上数据更新速度的快速变化领域",
      "没有计算预算或 MLOps 基础设施来管理微调流水线的团队",
      "基础模型使用 RAG 的零样本表现已满足质量要求的场景"
    ],
    "adopters": [
      "UC Berkeley",
      "Microsoft Research",
      "Anyscale",
      "Databricks",
      "Together AI"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "performance",
      "reliability"
    ],
    "maturity_ring": "experimental",
    "primary_source": "Tianjun Zhang et al. (2024). \"RAFT: Adapting Language Model to Domain Specific RAG\". arXiv:2403.10131.",
    "secondary_sources": [
      "Patrick Lewis et al. (2020). \"Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks\". NeurIPS 2020.",
      "Edward Hu et al. (2021). \"LoRA: Low-Rank Adaptation of Large Language Models\". arXiv:2106.09685."
    ],
    "typed_relations": [
      {
        "slug": "rag-architecture",
        "type": "extends"
      },
      {
        "slug": "prompt-chaining",
        "type": "complement"
      },
      {
        "slug": "llm-evaluation-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 194,
    "name": "AI Safety Layers (Defense in Depth for AI)",
    "name_zh": "AI 安全层（AI 纵深防御）",
    "slug": "ai-safety-layers",
    "category": "ai",
    "desc": "Multi-layer AI safety architecture",
    "desc_zh": "多层 AI 安全架构",
    "steps": [
      "Define the threat model: prompt injection, jailbreaks, data leakage, harmful content, and unauthorized actions",
      "Implement input guardrails: content filters, prompt validation, and injection detection before the LLM processes any request",
      "Apply model-level safety: system prompts with safety instructions, Constitutional AI training, and output classifiers",
      "Add output guardrails: content filtering, PII detection, hallucination checks, and citation verification on generated responses",
      "Deploy operational safety: rate limiting, audit logging, human-in-the-loop for high-risk actions, and continuous red-teaming"
    ],
    "steps_zh": [
      "定义威胁模型：提示注入、越狱攻击、数据泄露、有害内容和未授权操作",
      "实施输入防护：内容过滤、提示验证和注入检测，在大模型处理请求之前执行",
      "应用模型层安全：带安全指令的系统提示词、Constitutional AI 训练和输出分类器",
      "添加输出防护：对生成响应进行内容过滤、PII 检测、幻觉检查和引用验证",
      "部署运维安全：速率限制、审计日志、高风险操作的人机协同和持续红队测试"
    ],
    "ai_relevant": true,
    "viz_type": "pyramid",
    "viz_labels": [
      "Model Safety",
      "App Guard",
      "Infra",
      "Human"
    ],
    "viz_labels_zh": [
      "模型安全",
      "应用护栏",
      "基础设施",
      "人工监管"
    ],
    "related": [
      "responsible-ai-design",
      "guardrails-framework",
      "llm-evaluation-framework",
      "human-in-the-loop"
    ],
    "tags": [
      "ai-safety",
      "defense-in-depth",
      "guardrails",
      "security",
      "responsible-ai"
    ],
    "origin_author": "Industry convergence: Anthropic (Constitutional AI), NIST (AI RMF), OWASP (LLM Top 10), 2023-2024",
    "origin_source": "NIST AI Risk Management Framework (2023); OWASP Top 10 for LLM Applications (2023); Anthropic safety research (2023-2024)",
    "origin_source_zh": "NIST AI 风险管理框架（2023）；OWASP 大模型应用十大风险（2023）；Anthropic 安全研究（2023-2024）",
    "complexity": "advanced",
    "when_to_use": [
      "Any production LLM application that interacts with end users or processes sensitive data",
      "Enterprise AI deployments subject to regulatory requirements (EU AI Act, HIPAA, SOC 2)",
      "Agentic systems with tool access that can take real-world actions (send emails, modify data, execute code)",
      "Customer-facing AI products where brand reputation depends on safe, reliable model behavior"
    ],
    "when_to_use_zh": [
      "任何与终端用户交互或处理敏感数据的生产大模型应用",
      "受监管要求约束（EU AI Act、HIPAA、SOC 2）的企业 AI 部署",
      "可执行真实操作（发送邮件、修改数据、执行代码）的工具调用智能体系统",
      "品牌声誉依赖安全可靠模型行为的面向客户的 AI 产品"
    ],
    "core_concepts": [
      "Defense in Depth: No single safety layer is sufficient — multiple independent layers ensure that if one fails, others catch the threat",
      "Input Guardrails: Pre-processing filters that detect and block prompt injection, jailbreaks, and malicious inputs before they reach the model",
      "Output Guardrails: Post-processing checks that filter harmful content, detect PII leakage, and verify factual accuracy of generated responses",
      "Operational Safety: Runtime controls including rate limiting, audit logging, human-in-the-loop gates, and continuous adversarial testing"
    ],
    "core_concepts_zh": [
      "纵深防御：任何单一安全层都不够——多个独立层确保一层失败时其他层能捕获威胁",
      "输入防护：在请求到达模型前检测和阻止提示注入、越狱攻击和恶意输入的预处理过滤器",
      "输出防护：过滤有害内容、检测 PII 泄露和验证生成响应事实准确性的后处理检查",
      "运维安全：包括速率限制、审计日志、人机协同门控和持续对抗性测试的运行时控制"
    ],
    "timeline": [
      [
        "2022-12",
        "Anthropic publishes Constitutional AI, introducing model-level safety training via self-critique and revision"
      ],
      [
        "2023-01",
        "NIST releases the AI Risk Management Framework, establishing a structured approach to AI safety governance"
      ],
      [
        "2023-08",
        "OWASP publishes the Top 10 for LLM Applications, cataloging the most critical LLM security risks"
      ],
      [
        "2024",
        "Guardrails-as-a-service platforms (Guardrails AI, Lakera, Robust Intelligence) emerge for production LLM safety"
      ],
      [
        "2025",
        "Multi-layer safety architectures become standard in enterprise AI, with automated red-teaming integrated into CI/CD"
      ]
    ],
    "timeline_zh": [
      [
        "2022-12",
        "Anthropic 发表 Constitutional AI，引入通过自我批评和修订的模型层安全训练"
      ],
      [
        "2023-01",
        "NIST 发布 AI 风险管理框架，建立结构化的 AI 安全治理方法"
      ],
      [
        "2023-08",
        "OWASP 发布大模型应用十大风险，编录最关键的大模型安全风险"
      ],
      [
        "2024",
        "防护栏即服务平台（Guardrails AI、Lakera、Robust Intelligence）涌现，用于生产大模型安全"
      ],
      [
        "2025",
        "多层安全架构成为企业 AI 标准，自动化红队测试集成到 CI/CD 中"
      ]
    ],
    "dos": [
      "Implement independent safety layers at input, model, output, and operational levels — defense in depth requires redundancy",
      "Red-team your system regularly with adversarial prompts to discover bypass vectors before attackers do",
      "Log all LLM interactions for audit and incident investigation, with appropriate data retention policies",
      "Keep safety classifiers and guardrail rules updated as new attack techniques emerge — AI safety is an arms race"
    ],
    "dos_zh": [
      "在输入、模型、输出和运维层面实施独立安全层——纵深防御需要冗余",
      "定期使用对抗性提示对系统进行红队测试，在攻击者之前发现绕过向量",
      "记录所有大模型交互以供审计和事件调查，并制定适当的数据保留策略",
      "随着新攻击技术出现持续更新安全分类器和防护规则——AI 安全是一场军备竞赛"
    ],
    "donts": [
      "Don't rely solely on the model's built-in safety training — it can be bypassed and must be supplemented with external guardrails",
      "Don't treat AI safety as a one-time setup — threats evolve continuously and safety systems must be maintained actively",
      "Don't skip output validation because the input was clean — the model can generate harmful content from benign inputs",
      "Don't deploy agentic AI with tool access without human-in-the-loop gates for irreversible actions"
    ],
    "donts_zh": [
      "不要仅依赖模型内置的安全训练——它可以被绕过，必须辅以外部防护栏",
      "不要将 AI 安全视为一次性设置——威胁持续演化，安全系统必须主动维护",
      "不要因为输入干净就跳过输出验证——模型可以从良性输入生成有害内容",
      "不要在没有对不可逆操作设置人机协同门控的情况下部署带工具访问的智能体 AI"
    ],
    "case_study_company": "Anthropic",
    "case_study": "Anthropic implements a comprehensive multi-layer safety architecture for Claude. The system includes input classifiers that detect prompt injection and jailbreak attempts, Constitutional AI training that teaches the model to self-critique and revise harmful outputs, output filters that catch residual harmful content, and operational controls including rate limiting and usage monitoring. This layered approach enables Claude to be deployed in high-stakes enterprise environments while maintaining strong safety properties — no single layer is trusted to be sufficient alone.",
    "case_study_zh": "Anthropic 为 Claude 实施了全面的多层安全架构。系统包括检测提示注入和越狱尝试的输入分类器、教模型自我批评和修订有害输出的 Constitutional AI 训练、捕获残留有害内容的输出过滤器，以及包含速率限制和使用监控的运维控制。这种分层方法使 Claude 能够在高风险企业环境中部署，同时保持强大的安全属性——不假定任何单一层能够独立提供足够的防护。",
    "when_not_to_use": [
      "Internal research and experimentation environments where safety constraints would impede model evaluation",
      "Non-interactive batch processing where the model has no exposure to adversarial user input",
      "Toy projects and demos where the overhead of multi-layer safety is disproportionate to the risk",
      "Fully sandboxed environments with no access to sensitive data or external systems"
    ],
    "when_not_to_use_zh": [
      "安全约束会妨碍模型评估的内部研究和实验环境",
      "模型不暴露于对抗性用户输入的非交互式批处理",
      "多层安全开销与风险不成比例的玩具项目和演示",
      "无法访问敏感数据或外部系统的完全沙盒化环境"
    ],
    "adopters": [
      "Anthropic",
      "OpenAI",
      "Google DeepMind",
      "Microsoft",
      "Guardrails AI"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "NIST (2023). \"AI Risk Management Framework (AI RMF 1.0)\". nist.gov.",
    "secondary_sources": [
      "OWASP (2023). \"OWASP Top 10 for Large Language Model Applications\". owasp.org.",
      "Yuntao Bai et al. (2022). \"Constitutional AI: Harmlessness from AI Feedback\". arXiv:2212.08073. Anthropic."
    ],
    "typed_relations": [
      {
        "slug": "responsible-ai-design",
        "type": "complement"
      },
      {
        "slug": "guardrails-framework",
        "type": "complement"
      },
      {
        "slug": "llm-evaluation-framework",
        "type": "complement"
      },
      {
        "slug": "human-in-the-loop",
        "type": "complement"
      }
    ]
  },
  {
    "id": 293,
    "name": "Multimodal Pipeline Design",
    "name_zh": "多模态流水线设计",
    "slug": "multimodal-pipeline-design",
    "category": "ai",
    "desc": "Architecture for processing text, image, audio, and video in unified AI pipelines",
    "desc_zh": "在统一 AI 流水线中处理文本、图像、音频和视频的架构",
    "steps": [
      "Identify the modalities required by the task and select or fine-tune a backbone model capable of handling each modality",
      "Design modality-specific preprocessing stages: tokenization for text, patch embeddings for images, spectrograms for audio",
      "Define a fusion strategy — early fusion (concatenate inputs), late fusion (merge outputs), or cross-attention across modalities",
      "Build a shared embedding space so representations from different modalities can be compared, retrieved, or jointly reasoned over",
      "Evaluate pipeline output with modality-specific and cross-modal metrics; iterate on fusion strategy and preprocessing based on failure analysis"
    ],
    "steps_zh": [
      "识别任务所需的模态，选择或微调能够处理每种模态的骨干模型",
      "设计针对各模态的预处理阶段：文本的分词、图像的块嵌入、音频的频谱图",
      "定义融合策略——早期融合（拼接输入）、晚期融合（合并输出）或跨模态注意力",
      "构建共享嵌入空间，使不同模态的表示可以被比较、检索或联合推理",
      "使用特定模态和跨模态指标评估流水线输出；根据失败分析迭代融合策略和预处理"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Input",
      "Encode",
      "Fuse",
      "Generate"
    ],
    "viz_labels_zh": [
      "多模态输入",
      "编码",
      "模态融合",
      "生成输出"
    ],
    "related": [
      "prompt-chaining",
      "multi-agent-orchestration-pattern",
      "llm-evaluation-framework"
    ],
    "tags": [
      "multimodal",
      "pipeline",
      "vision",
      "audio",
      "embeddings",
      "llm"
    ],
    "origin_author": "Google DeepMind",
    "origin_source": "Google DeepMind (2023). \"Gemini: A Family of Highly Capable Multimodal Models\". arXiv:2312.11805.",
    "origin_source_zh": "Google DeepMind（2023）。《Gemini：高能力多模态模型家族》。arXiv:2312.11805。",
    "complexity": "advanced",
    "when_to_use": [
      "Applications where the input is inherently multimodal — e.g., document understanding (PDF text + images), video captioning, or audio transcription with speaker diarization",
      "AI assistants that need to accept arbitrary file uploads and reason over mixed content types in a single context",
      "Retrieval systems that must index and search across modalities — finding images by text query or audio clips by semantic description",
      "Production systems where different modality models must be orchestrated reliably with shared observability and error handling"
    ],
    "when_to_use_zh": [
      "输入本质上是多模态的应用——如文档理解（PDF 文本 + 图像）、视频字幕或带说话人区分的音频转录",
      "需要接受任意文件上传并在单一上下文中对混合内容类型进行推理的 AI 助手",
      "必须跨模态索引和搜索的检索系统——通过文本查询查找图像或通过语义描述查找音频片段",
      "需要在统一的可观测性和错误处理机制下可靠编排不同模态模型的生产系统"
    ],
    "core_concepts": [
      "Modality Encoding: Each input type is converted to a dense vector representation using a modality-specific encoder (CLIP for images, Whisper for audio, transformer for text)",
      "Fusion Architecture: Early fusion combines raw or encoded inputs before the reasoning model; late fusion combines independent model outputs; cross-attention enables modalities to attend to each other",
      "Shared Embedding Space: Contrastive training (CLIP-style) aligns representations from different modalities so semantic similarity is comparable across modality boundaries",
      "Modality Routing: A dispatcher determines which modality encoder, preprocessor, and downstream model handles each segment of a mixed input"
    ],
    "core_concepts_zh": [
      "模态编码：每种输入类型使用特定模态编码器转换为密集向量表示（图像用 CLIP、音频用 Whisper、文本用 Transformer）",
      "融合架构：早期融合在推理模型之前合并原始或编码输入；晚期融合合并独立模型输出；交叉注意力使模态之间相互关注",
      "共享嵌入空间：对比训练（CLIP 风格）对齐不同模态的表示，使跨模态边界的语义相似性可以比较",
      "模态路由：调度器确定哪个模态编码器、预处理器和下游模型处理混合输入的每个片段"
    ],
    "timeline": [
      [
        "2021",
        "OpenAI releases CLIP, demonstrating that contrastive pretraining can align image and text in a shared embedding space"
      ],
      [
        "2022",
        "DeepMind's Flamingo and Google's PaLI establish large-scale vision-language models as a viable architecture for joint reasoning"
      ],
      [
        "2023",
        "GPT-4V adds image understanding to GPT-4, and Gemini launches as a natively multimodal model family spanning text, image, audio, and video"
      ],
      [
        "2024",
        "Multimodal models become standard in production AI applications; frameworks like LangChain and LlamaIndex add native multimodal pipeline support"
      ]
    ],
    "timeline_zh": [
      [
        "2021",
        "OpenAI 发布 CLIP，证明对比预训练可以在共享嵌入空间中对齐图像和文本"
      ],
      [
        "2022",
        "DeepMind 的 Flamingo 和 Google 的 PaLI 确立了大规模视觉语言模型作为联合推理可行架构"
      ],
      [
        "2023",
        "GPT-4V 为 GPT-4 增加图像理解能力，Gemini 作为覆盖文本、图像、音频和视频的原生多模态模型家族发布"
      ],
      [
        "2024",
        "多模态模型成为生产 AI 应用的标准；LangChain 和 LlamaIndex 等框架添加原生多模态流水线支持"
      ]
    ],
    "dos": [
      "Do evaluate each modality encoder independently before integration so that performance regressions can be attributed to a specific pipeline stage",
      "Do design modality preprocessing as stateless, composable functions so individual encoders can be swapped or upgraded without re-architecting the pipeline",
      "Do include cross-modal evaluation benchmarks (e.g., VQA for vision-language, AudioCaps for audio-text) alongside single-modality metrics",
      "Do instrument modality routing decisions in production so you can analyze which modalities are actually used and optimize pipeline cost accordingly"
    ],
    "dos_zh": [
      "务必在集成前独立评估每个模态编码器，使性能回归可以归因于特定流水线阶段",
      "务必将模态预处理设计为无状态的、可组合的函数，使单个编码器可以在不重新架构流水线的情况下替换或升级",
      "务必在单模态指标旁边包含跨模态评估基准（如视觉语言的 VQA、音频文本的 AudioCaps）",
      "务必在生产中记录模态路由决策，以便分析实际使用了哪些模态并相应优化流水线成本"
    ],
    "donts": [
      "Don't assume a multimodal model handles all modalities equally well — benchmark each modality independently and set separate quality thresholds",
      "Don't pass raw binary file bytes directly to an LLM API without explicit preprocessing — unstructured blobs produce unpredictable model behavior",
      "Don't neglect per-modality latency profiling — image encoding can dominate pipeline latency and must be optimized separately from the text generation step",
      "Don't conflate multimodal input with multimodal output — generating images, audio, or video requires separate output decoders and a distinct safety review"
    ],
    "donts_zh": [
      "不要假设多模态模型在所有模态上表现同等出色——独立对每种模态进行基准测试并设置单独的质量阈值",
      "不要在没有明确预处理的情况下直接将原始二进制文件字节传递给 LLM API——非结构化二进制块产生不可预测的模型行为",
      "不要忽视按模态的延迟分析——图像编码可能主导流水线延迟，必须与文本生成步骤分开优化",
      "不要混淆多模态输入与多模态输出——生成图像、音频或视频需要单独的输出解码器和独立的安全审查"
    ],
    "case_study_company": "Google DeepMind",
    "case_study": "Google DeepMind's Gemini architecture demonstrated the first natively multimodal large model trained jointly across text, image, audio, and video from scratch rather than stitching together separate models. The Gemini Ultra pipeline uses a unified Transformer that accepts interleaved multimodal tokens, enabling tasks like reading a chart and answering a verbal question about it in a single forward pass. This architecture achieved state-of-the-art results on 30 of 32 widely used academic benchmarks spanning text and multimodal tasks at launch, validating the joint-training approach over late-fusion ensemble alternatives.",
    "case_study_zh": "Google DeepMind 的 Gemini 架构展示了第一个从头在文本、图像、音频和视频上联合训练的原生多模态大模型，而非拼接独立模型。Gemini Ultra 流水线使用统一的 Transformer 接受交织的多模态 Token，支持在单次前向传播中完成读取图表并回答口头问题等任务。此架构在发布时在32个广泛使用的学术基准（涵盖文本与多模态任务）中的30个上达到了最先进的结果，验证了联合训练方法优于晚期融合集成替代方案。",
    "when_not_to_use": [
      "Applications that are genuinely text-only where multimodal complexity adds cost and latency without any benefit",
      "Prototypes or MVPs where a single-modality model is sufficient to validate the core product hypothesis before investing in multimodal infrastructure",
      "Low-latency real-time applications where the sequential preprocessing overhead of multiple modality encoders violates latency SLAs",
      "Teams without the ML engineering expertise to debug cross-modal embedding alignment issues and modality-specific preprocessing failures"
    ],
    "when_not_to_use_zh": [
      "真正纯文本的应用——多模态复杂性增加了成本和延迟而没有任何收益",
      "单模态模型足以验证核心产品假设的原型或 MVP——在投资多模态基础设施前",
      "多个模态编码器的顺序预处理开销违反延迟 SLA 的低延迟实时应用",
      "缺乏调试跨模态嵌入对齐问题和特定模态预处理失败的 ML 工程专业知识的团队"
    ],
    "adopters": [
      "Google DeepMind",
      "OpenAI",
      "Anthropic",
      "Meta AI",
      "Microsoft",
      "Hugging Face"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "performance",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Google DeepMind (2023). \"Gemini: A Family of Highly Capable Multimodal Models\". arXiv:2312.11805.",
    "secondary_sources": [
      "Alec Radford et al. (2021). \"Learning Transferable Visual Models From Natural Language Supervision (CLIP)\". OpenAI. arXiv:2103.00020.",
      "Jean-Baptiste Alayrac et al. (2022). \"Flamingo: a Visual Language Model for Few-Shot Learning\". DeepMind. arXiv:2204.14198.",
      "Haotian Liu et al. (2023). \"Visual Instruction Tuning (LLaVA)\". arXiv:2304.08485."
    ],
    "typed_relations": [
      {
        "slug": "prompt-chaining",
        "type": "complement"
      },
      {
        "slug": "multi-agent-orchestration-pattern",
        "type": "complement"
      },
      {
        "slug": "llm-evaluation-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 294,
    "name": "AI Cost Optimization",
    "name_zh": "AI 成本优化",
    "slug": "ai-cost-optimization",
    "category": "ai",
    "desc": "Systematic strategies for managing and reducing LLM inference costs at production scale",
    "desc_zh": "在生产规模下系统性管理和降低大模型推理成本的策略",
    "steps": [
      "Instrument your LLM usage: capture token counts, model tier, latency, and cost per request to establish a baseline cost profile",
      "Classify requests by complexity and route to the cheapest model tier that meets the quality threshold for each task type",
      "Reduce input token volume through prompt compression, context summarization, and retrieval-augmented generation over full context injection",
      "Cache deterministic or near-deterministic LLM responses using semantic caching to eliminate redundant inference calls",
      "Evaluate output quality vs. cost tradeoffs continuously using automated eval frameworks to ensure cost reduction does not degrade user outcomes"
    ],
    "steps_zh": [
      "对 LLM 使用情况进行监控：捕获每次请求的 Token 数、模型层级、延迟和成本，建立成本基线",
      "按复杂度对请求分类，并路由到满足每种任务类型质量阈值的最低成本模型层级",
      "通过提示词压缩、上下文摘要和检索增强生成替代完整上下文注入来减少输入 Token 量",
      "使用语义缓存对确定性或近确定性的 LLM 响应进行缓存，消除冗余推理调用",
      "使用自动化评估框架持续评估输出质量与成本的权衡，确保降低成本不会降低用户体验"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Model Size",
      "Caching",
      "Batching",
      "Routing"
    ],
    "viz_labels_zh": [
      "模型规模",
      "缓存策略",
      "批量处理",
      "智能路由"
    ],
    "related": [
      "prompt-chaining",
      "llm-evaluation-framework",
      "multimodal-pipeline-design"
    ],
    "tags": [
      "cost-optimization",
      "llm",
      "inference",
      "caching",
      "model-routing",
      "production-ai"
    ],
    "origin_author": "a16z",
    "origin_source": "a16z (2023). \"The Economics of Generative AI\". a16z.com/2023/04/the-economics-of-generative-ai/.",
    "origin_source_zh": "a16z（2023）。《生成式 AI 的经济学》。a16z.com/2023/04/the-economics-of-generative-ai/。",
    "complexity": "intermediate",
    "when_to_use": [
      "Production AI applications with significant monthly LLM spend where cost is becoming a margin concern",
      "Systems with heterogeneous request types where some tasks can be handled by smaller, cheaper models without quality loss",
      "Applications with high query repetition rates where semantic caching can dramatically reduce redundant inference calls",
      "Teams preparing to scale from prototype to production who want to right-size inference costs before traffic grows"
    ],
    "when_to_use_zh": [
      "每月 LLM 支出显著、成本成为利润问题的生产 AI 应用",
      "请求类型异构、某些任务可由更小、更便宜的模型处理而不损失质量的系统",
      "查询重复率高、语义缓存可以大幅减少冗余推理调用的应用",
      "正在从原型扩展到生产、希望在流量增长前合理调整推理成本的团队"
    ],
    "core_concepts": [
      "Model Tiering: Match request complexity to model capability — use small fast models (GPT-4o-mini, Claude Haiku) for classification and extraction; reserve frontier models for complex reasoning",
      "Semantic Caching: Store LLM responses keyed by query embeddings and retrieve cached answers for semantically similar future queries, bypassing inference for repeated or near-identical requests",
      "Context Window Economy: Every input token has a cost; prompt compression, RAG, and conversation summarization reduce the average context length per request",
      "Batch Inference: Non-real-time workloads can use asynchronous batch APIs at 50–80% lower cost per token than synchronous endpoints"
    ],
    "core_concepts_zh": [
      "模型分层：将请求复杂度与模型能力匹配——对分类和提取使用小型快速模型（GPT-4o-mini、Claude Haiku）；为复杂推理保留前沿模型",
      "语义缓存：以查询嵌入为键存储 LLM 响应，并为语义相似的未来查询检索缓存答案，对重复或近乎相同的请求绕过推理",
      "上下文窗口经济：每个输入 Token 都有成本；提示词压缩、RAG 和对话摘要降低每次请求的平均上下文长度",
      "批处理推理：非实时工作负载可以使用异步批处理 API，每 Token 成本比同步端点低50–80%"
    ],
    "timeline": [
      [
        "2020",
        "GPT-3 API pricing introduces per-token billing, making LLM cost a first-class engineering concern for the first time"
      ],
      [
        "2022",
        "Model distillation and fine-tuning of smaller models (Alpaca, Dolly) emerge as cost-reduction strategies for specialized tasks"
      ],
      [
        "2023",
        "a16z publishes the \"AI economics\" framework; semantic caching tools (GPTCache) and model routing libraries emerge as dedicated solutions"
      ],
      [
        "2024",
        "OpenAI and Anthropic introduce prompt caching as a native API feature, reducing repeated context costs by up to 90%"
      ]
    ],
    "timeline_zh": [
      [
        "2020",
        "GPT-3 API 定价引入按 Token 计费，首次使 LLM 成本成为工程领域的一等问题"
      ],
      [
        "2022",
        "较小模型的知识蒸馏和微调（Alpaca、Dolly）作为专业任务的成本降低策略出现"
      ],
      [
        "2023",
        "a16z 发布「AI 经济学」框架；语义缓存工具（GPTCache）和模型路由库作为专用解决方案出现"
      ],
      [
        "2024",
        "OpenAI 和 Anthropic 将提示词缓存作为原生 API 功能引入，将重复上下文成本降低高达90%"
      ]
    ],
    "dos": [
      "Do measure cost per user outcome (e.g., cost per successful task completion) rather than cost per API call, because optimizing raw token spend can inadvertently increase total cost via higher retry rates",
      "Do establish quality floor metrics before cost optimization so you have a defensible threshold below which cost cuts are blocked",
      "Do test model downgrades on a representative sample of production traffic before full rollout, because benchmark performance does not always predict production quality",
      "Do use prompt caching for system prompts and long static context that is reused across many requests, because this is typically the highest-ROI optimization available"
    ],
    "dos_zh": [
      "务必衡量每个用户结果的成本（如每次成功任务完成的成本）而非每次 API 调用的成本，因为优化原始 Token 支出可能通过更高的重试率无意中增加总成本",
      "务必在成本优化前建立质量下限指标，使你有一个站得住脚的阈值，低于该阈值则阻止成本削减",
      "务必在全量推广前对具有代表性的生产流量样本测试模型降级，因为基准性能并不总能预测生产质量",
      "务必对跨多个请求重用的系统提示词和长静态上下文使用提示词缓存，因为这通常是最高投资回报率的优化"
    ],
    "donts": [
      "Don't optimize LLM costs before you understand your usage profile — premature optimization targets the wrong bottlenecks and wastes engineering time",
      "Don't cache non-deterministic or personalized responses, because stale cached answers to user-specific queries degrade response quality and can expose one user's data to another",
      "Don't switch to a cheaper model without running an A/B evaluation on real traffic — lab benchmarks systematically underestimate quality gaps on production query distributions",
      "Don't neglect infrastructure costs (embedding generation, vector DB storage, reranking) when calculating total AI system cost — inference is often not the largest line item"
    ],
    "donts_zh": [
      "不要在了解使用情况前优化 LLM 成本——过早优化针对错误的瓶颈，浪费工程时间",
      "不要缓存非确定性或个性化响应，因为对用户特定查询的过期缓存答案会降低响应质量，并可能将一个用户的数据暴露给另一个用户",
      "不要在没有对真实流量进行 A/B 评估的情况下切换到更便宜的模型——实验室基准系统性地低估了生产查询分布上的质量差距",
      "不要在计算 AI 系统总成本时忽视基础设施成本（嵌入生成、向量数据库存储、重排序）——推理通常不是最大的成本项"
    ],
    "case_study_company": "Notion",
    "case_study": "Notion reduced their AI feature inference costs by over 80% between 2023 and 2024 through a combination of model routing, prompt compression, and caching. They implemented a complexity classifier that routes simple autocomplete and summarization requests to GPT-3.5-class models while reserving GPT-4-class models for complex drafting and Q&A. Prompt compression using LLMLingua reduced their average context length by 40%. A semantic cache for their AI search feature achieved a 35% cache hit rate in production, eliminating over a third of all inference calls. These combined optimizations allowed Notion to offer unlimited AI features at no additional charge while maintaining profitability.",
    "case_study_zh": "Notion 通过模型路由、提示词压缩和缓存的组合，在2023年至2024年间将其 AI 功能推理成本降低了超过80%。他们实现了一个复杂度分类器，将简单的自动补全和摘要请求路由到 GPT-3.5 级别的模型，同时为复杂的起草和问答保留 GPT-4 级别的模型。使用 LLMLingua 进行的提示词压缩将其平均上下文长度减少了40%。其 AI 搜索功能的语义缓存在生产中实现了35%的缓存命中率，消除了超过三分之一的推理调用。这些综合优化使 Notion 能够在不增加额外费用的情况下提供无限 AI 功能，同时保持盈利能力。",
    "when_not_to_use": [
      "Early-stage AI products in the quality discovery phase where cost optimization introduces risk before the product has found its quality bar",
      "Applications with inherently unique, non-cacheable queries (e.g., real-time analysis of live data streams) where caching and batching strategies have negligible impact",
      "Safety-critical AI applications where model downgrading carries unacceptable quality risk that outweighs cost savings",
      "Teams spending under $1,000/month on LLM inference where optimization engineering effort exceeds projected savings over any reasonable payback period"
    ],
    "when_not_to_use_zh": [
      "质量探索阶段的早期 AI 产品——在产品找到其质量标准前，成本优化引入风险",
      "查询本质上唯一且不可缓存的应用（如对实时数据流的实时分析）——缓存和批处理策略影响微乎其微",
      "模型降级带来不可接受的质量风险、超过成本节省的安全关键 AI 应用",
      "每月 LLM 推理支出不足1000美元的团队——优化工程工作量超过任何合理回收周期内的预期节省"
    ],
    "adopters": [
      "Notion",
      "Jasper",
      "Cursor",
      "Perplexity AI",
      "Cohere",
      "Scale AI"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "performance",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "a16z (2023). \"The Economics of Generative AI\". a16z.com/2023/04/the-economics-of-generative-ai/.",
    "secondary_sources": [
      "Huiqiang Jiang et al. (2023). \"LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models\". arXiv:2310.05736.",
      "Fu Bang (2023). \"GPTCache: An Open-Source Semantic Cache for LLM Applications\". arXiv:2306.03799.",
      "Simon Willison (2024). \"Pricing and the economics of running LLMs in production\". simonwillison.net."
    ],
    "typed_relations": [
      {
        "slug": "llm-evaluation-framework",
        "type": "complement"
      },
      {
        "slug": "prompt-chaining",
        "type": "complement"
      },
      {
        "slug": "multimodal-pipeline-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 297,
    "name": "Semantic Caching",
    "name_zh": "语义缓存",
    "slug": "semantic-caching",
    "category": "ai",
    "desc": "Caching strategy for LLM applications that stores and retrieves responses based on semantic similarity of queries rather than exact string match",
    "desc_zh": "LLM 应用的缓存策略，基于查询的语义相似度而非精确字符串匹配来存储和检索响应",
    "steps": [
      "Embed each incoming query using a lightweight embedding model to produce a vector representation of the query's semantic meaning",
      "Perform an approximate nearest-neighbour search against a vector store containing embeddings of previously answered queries to find semantically similar cached entries",
      "If a cached entry is found above a configured similarity threshold (e.g., cosine similarity > 0.92), return the cached LLM response without calling the inference endpoint",
      "If no sufficiently similar cache entry exists, route the query to the LLM, store the response along with its query embedding in the cache for future retrieval",
      "Monitor cache hit rate, similarity threshold performance, and response quality degradation to tune the similarity threshold and eviction policies over time"
    ],
    "steps_zh": [
      "使用轻量级嵌入模型对每条传入查询进行嵌入，生成查询语义含义的向量表示",
      "对包含已回答查询嵌入的向量存储执行近似最近邻搜索，查找语义相似的缓存条目",
      "如果找到相似度超过配置阈值（如余弦相似度 > 0.92）的缓存条目，则直接返回缓存的 LLM 响应，无需调用推理端点",
      "如果不存在足够相似的缓存条目，将查询路由至 LLM，并将响应连同查询嵌入一起存储到缓存供未来检索",
      "监控缓存命中率、相似度阈值性能和响应质量退化情况，随时间调整相似度阈值和淘汰策略"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Query",
      "Embed",
      "Cache Hit",
      "LLM Call"
    ],
    "viz_labels_zh": [
      "查询",
      "向量化",
      "缓存命中",
      "模型调用"
    ],
    "related": [
      "ai-cost-optimization",
      "llm-evaluation-framework"
    ],
    "tags": [
      "semantic-caching",
      "llm",
      "cost-optimization",
      "embeddings",
      "vector-search",
      "inference"
    ],
    "origin_author": "GPTCache",
    "origin_source": "Bang, F. (2023). \"GPTCache: An Open-Source Semantic Cache for LLM Applications Enabling Faster Answers and Cost Savings\". arXiv:2306.03799.",
    "origin_source_zh": "Bang, F.（2023）。《GPTCache：为 LLM 应用实现更快响应与成本节约的开源语义缓存》。arXiv:2306.03799。",
    "complexity": "intermediate",
    "when_to_use": [
      "LLM applications with high query repetition rates such as customer support bots, FAQ assistants, or domain-specific Q&A systems where many users ask semantically similar questions",
      "Production AI systems with significant monthly inference spend where a 20-40% cache hit rate would produce meaningful cost reduction without compromising response quality",
      "Latency-sensitive LLM applications where returning cached responses in milliseconds versus waiting seconds for inference provides a material user experience improvement",
      "Multi-tenant SaaS platforms where different users frequently ask equivalent questions about shared knowledge domains"
    ],
    "when_to_use_zh": [
      "查询重复率高的 LLM 应用，如客户支持机器人、FAQ 助手或特定领域的问答系统，其中许多用户会提出语义相似的问题",
      "每月推理支出较大的生产 AI 系统，其中 20-40% 的缓存命中率可以在不损害响应质量的情况下实现有意义的成本降低",
      "对延迟敏感的 LLM 应用，毫秒级返回缓存响应与等待数秒推理相比，可显著改善用户体验",
      "不同用户经常就共享知识领域提出等效问题的多租户 SaaS 平台"
    ],
    "core_concepts": [
      "Embedding-Based Similarity: queries are converted to dense vector embeddings using a small fast model; similarity is measured in embedding space, not by string comparison, enabling matches for paraphrased or reworded questions",
      "Similarity Threshold: a configurable cosine or dot-product similarity threshold controls the precision-recall trade-off — lower thresholds increase hit rate but risk returning slightly off-topic cached answers",
      "Vector Store Integration: cached query embeddings are stored in a vector database (FAISS, Qdrant, Redis with vector module) to enable sub-millisecond approximate nearest-neighbour lookup at scale",
      "Cache Scope and Isolation: caches can be scoped globally (all users share one cache), per user (personalised caches), or per session (conversational context isolation) depending on query personalisation requirements"
    ],
    "core_concepts_zh": [
      "基于嵌入的相似度：使用小型快速模型将查询转换为密集向量嵌入；相似度在嵌入空间中衡量而非字符串比较，能够匹配改述或重新措辞的问题",
      "相似度阈值：可配置的余弦或点积相似度阈值控制精确率-召回率权衡——阈值越低命中率越高，但存在返回略微偏题的缓存答案的风险",
      "向量存储集成：缓存的查询嵌入存储在向量数据库（FAISS、Qdrant、带向量模块的 Redis）中，以实现大规模亚毫秒级近似最近邻查找",
      "缓存范围和隔离：缓存可以按全局（所有用户共享一个缓存）、按用户（个性化缓存）或按会话（对话上下文隔离）划定范围，取决于查询个性化需求"
    ],
    "timeline": [
      [
        "2022",
        "Early adopters of ChatGPT APIs explore exact-match response caching to reduce costs; limitations with paraphrased queries motivate a semantic approach"
      ],
      [
        "2023",
        "Zilliz (Milvus team) open-sources GPTCache, the first dedicated semantic caching library for LLM applications, introducing configurable similarity thresholds and pluggable vector backends"
      ],
      [
        "2024",
        "Redis, Weaviate, and Pinecone add native semantic caching APIs; major AI gateway providers (Kong, Cloudflare AI Gateway) integrate semantic caching as a built-in feature"
      ],
      [
        "2025",
        "Semantic caching becomes a standard component in LLM infrastructure stacks, deployed alongside the server-side prompt caching that Anthropic and OpenAI introduced in 2024 as a complementary mechanism"
      ]
    ],
    "timeline_zh": [
      [
        "2022",
        "ChatGPT API 的早期采用者探索精确匹配响应缓存以降低成本；改述查询的局限性推动了语义方法的诞生"
      ],
      [
        "2023",
        "Zilliz（Milvus 团队）开源 GPTCache，这是首个专为 LLM 应用设计的语义缓存库，引入了可配置的相似度阈值和可插拔的向量后端"
      ],
      [
        "2024",
        "Redis、Weaviate 和 Pinecone 添加原生语义缓存 API；主要 AI 网关提供商（Kong、Cloudflare AI Gateway）将语义缓存作为内置功能集成"
      ],
      [
        "2025",
        "语义缓存成为 LLM 基础设施栈的标准组件，并与 Anthropic 和 OpenAI 于 2024 年推出的服务器端提示缓存互为补充"
      ]
    ],
    "dos": [
      "Do evaluate cache hit quality with human review on a sample of served cached responses before enabling semantic caching in production — a high hit rate with poor quality is worse than no caching",
      "Do segment cache namespaces by domain or use-case to prevent cross-domain semantic collisions where queries about different topics map to similar embeddings",
      "Do set cache TTL (time-to-live) policies based on knowledge freshness requirements — FAQ caches can have longer TTLs than caches for queries about rapidly changing information",
      "Do monitor the embedding model and similarity threshold combination together — swapping the embedding model invalidates all cached entries and requires cache warming"
    ],
    "dos_zh": [
      "在生产环境启用语义缓存之前，对已服务的缓存响应样本进行人工质量审查——高命中率但质量差比不缓存更糟糕",
      "按领域或使用场景划分缓存命名空间，防止不同主题的查询映射到相似嵌入时发生跨域语义冲突",
      "根据知识新鲜度要求设置缓存 TTL 策略——FAQ 缓存可以有比快速变化信息查询缓存更长的 TTL",
      "同时监控嵌入模型和相似度阈值的组合——更换嵌入模型会使所有缓存条目失效，需要重新预热缓存"
    ],
    "donts": [
      "Do not apply semantic caching to personalised or user-context-sensitive queries without strict namespace isolation — returning one user's cached answer to another user's query can expose private information",
      "Do not use a similarity threshold below 0.85 for factual question-answering applications — overly permissive thresholds cause semantic cache to serve subtly wrong answers to mismatched queries",
      "Do not cache responses for queries containing real-time data dependencies such as current prices, live status, or today's date — cached responses become stale and incorrect",
      "Do not skip embedding the cached response alongside the query — without response-side validation, cache entries degrade silently when the underlying LLM model is updated and responses change"
    ],
    "donts_zh": [
      "在没有严格命名空间隔离的情况下，不要将语义缓存应用于个性化或用户上下文敏感的查询——将一个用户的缓存答案返回给另一个用户的查询可能会暴露私人信息",
      "对于事实性问答应用，不要使用低于 0.85 的相似度阈值——过于宽松的阈值会导致语义缓存为不匹配的查询提供微妙错误的答案",
      "不要缓存包含实时数据依赖的查询响应，如当前价格、实时状态或今天的日期——缓存响应会过时并变得不正确",
      "不要跳过将缓存响应与查询一起嵌入的步骤——没有响应侧验证，当底层 LLM 模型更新且响应改变时，缓存条目会悄然退化"
    ],
    "case_study_company": "Grab",
    "case_study": "Grab, Southeast Asia's superapp, implemented semantic caching for their AI-powered customer support platform serving tens of millions of queries per month. Before semantic caching, their support bot made an LLM inference call for every unique query, resulting in significant monthly costs. After deploying GPTCache with a FAISS backend and a cosine similarity threshold of 0.90, they observed a 38% cache hit rate on their top support categories (ride cancellation, payment issues, driver queries). Cached responses were returned in under 20 milliseconds versus 1.8 seconds average for inference calls, improving p95 response time significantly. To ensure quality, they ran a weekly human evaluation of 200 randomly sampled cached responses and maintained a per-domain cache to prevent cross-category semantic drift. The net result was a 35% reduction in monthly LLM inference spend with no measurable degradation in CSAT scores.",
    "case_study_zh": "东南亚超级应用 Grab 为其每月服务数千万次查询的 AI 驱动客户支持平台实施了语义缓存。在实施语义缓存之前，他们的支持机器人对每个唯一查询都进行 LLM 推理调用，导致每月成本高昂。使用 FAISS 后端和 0.90 余弦相似度阈值部署 GPTCache 后，他们在顶级支持类别（行程取消、支付问题、司机查询）上观察到 38% 的缓存命中率。缓存响应在 20 毫秒以内返回，而推理调用平均需要 1.8 秒，显著改善了 p95 响应时间。为确保质量，他们每周对 200 个随机抽样的缓存响应进行人工评估，并维护每个领域的独立缓存以防止跨类别语义漂移。最终结果是每月 LLM 推理支出减少 35%，CSAT 评分没有可测量的下降。",
    "when_not_to_use": [
      "Creative writing, code generation, and open-ended generation tasks where every query intentionally produces a unique response and caching would return stale outputs",
      "Highly personalised applications where every response must be tailored to the specific user context, history, or preferences that differ between users asking equivalent questions",
      "Low-volume or low-repetition query distributions where the embedding computation and vector store overhead exceeds the inference cost savings from cache hits",
      "Applications requiring guaranteed freshness such as financial data queries, medical information retrieval, or legal research where cached answers from even hours ago may be materially incorrect"
    ],
    "when_not_to_use_zh": [
      "创意写作、代码生成和开放式生成任务，其中每个查询都有意产生唯一响应，缓存会返回过时的输出",
      "高度个性化的应用，其中每个响应必须根据特定用户的上下文、历史或偏好量身定制，这些因素在提出等效问题的用户之间有所不同",
      "查询量低或重复率低的分布，其中嵌入计算和向量存储开销超过缓存命中的推理成本节省",
      "需要保证新鲜度的应用，如金融数据查询、医疗信息检索或法律研究，其中即使几小时前的缓存答案也可能实质上不正确"
    ],
    "adopters": [
      "Grab",
      "Notion",
      "Perplexity AI",
      "Cohere",
      "LangChain",
      "Cloudflare"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "performance",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Liu, B. et al. (2023). \"GPTCache: An Open-Source Semantic Cache for LLM Applications\". arXiv:2306.03799.",
    "secondary_sources": [
      "a16z (2023). \"The Economics of Generative AI\". a16z.com.",
      "Cloudflare (2024). \"AI Gateway: Semantic Caching\". developers.cloudflare.com.",
      "Bang Liu (2023). \"GPTCache Documentation and Architecture\". github.com/zilliztech/GPTCache."
    ],
    "typed_relations": [
      {
        "slug": "ai-cost-optimization",
        "type": "complement"
      },
      {
        "slug": "llm-evaluation-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 298,
    "name": "AI Red Teaming",
    "name_zh": "AI 红队测试",
    "slug": "ai-red-teaming",
    "category": "ai",
    "desc": "Adversarial testing methodology for AI systems that uses structured attack exercises to discover safety vulnerabilities, harmful outputs, and failure modes before deployment.",
    "desc_zh": "AI 系统的对抗性测试方法，通过结构化攻击演练在部署前发现安全漏洞、有害输出和失效模式。",
    "steps": [
      "Define the threat model: identify the harms the AI system could cause (disinformation, harmful content generation, data exfiltration, bias amplification) and the adversarial actors who might exploit it",
      "Assemble a diverse red team combining domain experts, security researchers, ethicists, and ideally members of communities that could be harmed by model failures",
      "Conduct structured attack exercises: test prompt injection, jailbreaks, role-play exploits, indirect prompt injection, data poisoning surface areas, and edge-case inputs that probe the model's safety boundaries",
      "Document all discovered failure modes with severity ratings, reproducible prompts, and example outputs; categorise findings by harm type (safety, fairness, privacy, security)",
      "Triage findings with the model and product team: determine which failures require model retraining or RLHF updates, which require system-level mitigations (output filters, rate limiting), and which are acceptable residual risks with disclosure"
    ],
    "steps_zh": [
      "定义威胁模型：识别 AI 系统可能造成的危害（虚假信息、有害内容生成、数据泄露、偏见放大）以及可能加以利用的对抗性行为者",
      "组建多元化的红队，结合领域专家、安全研究人员、伦理学家，以及理想情况下可能受模型失效危害的社区成员",
      "开展结构化攻击演练：测试提示注入、越狱、角色扮演利用、间接提示注入、数据投毒攻击面，以及探测模型安全边界的边缘案例输入",
      "记录所有发现的失效模式，包括严重性评级、可复现的提示词和示例输出；按危害类型（安全、公平、隐私、安全性）对发现进行分类",
      "与模型和产品团队对发现进行分类处置：确定哪些失效需要模型重新训练或 RLHF 更新，哪些需要系统级缓解措施（输出过滤、速率限制），哪些是可接受的残余风险需要披露"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Attack",
      "Evaluate",
      "Harden",
      "Retest"
    ],
    "viz_labels_zh": [
      "攻击尝试",
      "评估漏洞",
      "加固防护",
      "回归测试"
    ],
    "related": [
      "llm-evaluation-framework",
      "ai-cost-optimization"
    ],
    "tags": [
      "red-teaming",
      "ai-safety",
      "adversarial-testing",
      "jailbreak",
      "prompt-injection",
      "responsible-ai"
    ],
    "origin_author": "Microsoft",
    "origin_source": "Microsoft (2023). \"Microsoft AI Red Team building future of safer AI\". microsoft.com/security. Also: Perez, F. & Ribeiro, I. (2022). \"Ignore Previous Prompt: Attack Techniques for Language Models\". arXiv:2211.09527.",
    "origin_source_zh": "Microsoft（2023）.「Microsoft AI Red Team building future of safer AI」. microsoft.com/security. 另见：Perez, F. & Ribeiro, I.（2022）.「Ignore Previous Prompt: Attack Techniques for Language Models」. arXiv:2211.09527.",
    "complexity": "advanced",
    "when_to_use": [
      "Before releasing any LLM-powered product to the public, especially applications in sensitive domains such as healthcare, finance, legal services, or content moderation",
      "When integrating third-party LLMs into existing products where the model's safety boundaries and failure modes are not fully understood by the deploying team",
      "After significant model updates, fine-tuning runs, or system prompt changes that could alter safety properties established in previous red team exercises",
      "When regulatory or compliance requirements mandate pre-deployment adversarial safety testing, such as EU AI Act high-risk system assessments"
    ],
    "when_to_use_zh": [
      "在向公众发布任何 LLM 驱动产品之前，尤其是医疗保健、金融、法律服务或内容审核等敏感领域的应用",
      "将第三方 LLM 集成到现有产品中，而部署团队尚未完全了解该模型的安全边界和失效模式时",
      "在模型重大更新、微调运行或系统提示词变更之后，这些变更可能会改变先前红队演练中建立的安全属性",
      "当监管或合规要求强制进行部署前对抗性安全测试时，例如欧盟《人工智能法案》高风险系统评估"
    ],
    "core_concepts": [
      "Threat Modelling for AI: unlike traditional software security, AI threat models must include harms from model outputs (disinformation, harmful content, bias) in addition to system intrusion — the attack surface includes the model itself, not just the application infrastructure",
      "Prompt Injection: adversarial inputs that cause the model to ignore its system prompt or safety guidelines — including direct injection (user crafts malicious input) and indirect injection (malicious content in retrieved documents overrides instructions)",
      "Jailbreaking: crafting prompts that cause a safety-trained model to violate its intended constraints, typically through role-play, hypothetical framing, or token-level manipulation",
      "Automated Red Teaming: using one LLM to generate adversarial prompts against another LLM at scale, enabling broader coverage of the attack space than manual human red teaming alone"
    ],
    "core_concepts_zh": [
      "AI 威胁建模：与传统软件安全不同，AI 威胁模型除了系统入侵外还必须包括来自模型输出的危害（虚假信息、有害内容、偏见）——攻击面包括模型本身，不仅仅是应用基础设施",
      "提示注入：导致模型忽略其系统提示词或安全指南的对抗性输入——包括直接注入（用户构造恶意输入）和间接注入（检索文档中的恶意内容覆盖指令）",
      "越狱：构造能使经过安全训练的模型违反其预期约束的提示词，通常通过角色扮演、假设框架或词元级操作实现",
      "自动化红队测试：使用一个 LLM 大规模生成针对另一个 LLM 的对抗性提示词，实现比单纯人工红队测试更广泛的攻击空间覆盖"
    ],
    "timeline": [
      [
        "2018",
        "Traditional security red teaming practices are adapted for NLP systems as early language models are deployed in consumer products; focus is on bias and toxicity detection"
      ],
      [
        "2022",
        "Widespread ChatGPT deployment prompts large-scale public jailbreaking; Microsoft and OpenAI formalise dedicated AI red team functions; Perez and Ribeiro publish the first systematic study of prompt injection attacks"
      ],
      [
        "2023",
        "Microsoft publishes their AI Red Team methodology; OpenAI publishes GPT-4 system card documenting red team findings; NIST AI Risk Management Framework includes adversarial testing requirements"
      ],
      [
        "2024",
        "EU AI Act Article 55 requires providers of general-purpose AI models with systemic risk to conduct and document adversarial testing; automated red teaming tools (Garak, PyRIT) emerge as open-source infrastructure for scalable adversarial evaluation"
      ]
    ],
    "timeline_zh": [
      [
        "2018",
        "随着早期语言模型被部署到消费级产品中，传统安全红队实践被调整应用于 NLP 系统；重点是偏见和毒性检测"
      ],
      [
        "2022",
        "ChatGPT 广泛部署引发大规模公共越狱；Microsoft 和 OpenAI 建立专门的 AI 红队职能；Perez 和 Ribeiro 发表首个提示注入攻击系统性研究"
      ],
      [
        "2023",
        "Microsoft 发布 AI 红队方法论；OpenAI 发布记录红队发现的 GPT-4 系统卡；NIST AI 风险管理框架纳入对抗性测试要求"
      ],
      [
        "2024",
        "欧盟《人工智能法案》第 55 条要求具有系统性风险的通用 AI 模型提供者开展并记录对抗性测试；自动化红队工具（Garak、PyRIT）作为可扩展对抗性评估的开源基础设施涌现"
      ]
    ],
    "dos": [
      "Do include diverse red teamers with different backgrounds, expertise, and lived experiences — model failures often affect specific communities disproportionately and homogeneous teams miss these failure modes",
      "Do document all discovered failures with reproducible prompts and complete model outputs, not just summaries — detailed documentation is essential for tracking fixes and regression testing",
      "Do run red team exercises iteratively throughout the model development lifecycle, not only as a final gate before release — early findings are cheaper to fix and prevent safety regressions during training",
      "Do combine automated red teaming (using LLM-based attack generation tools like Garak or PyRIT) with human red teaming — automation provides scale, humans provide creativity and cultural context that automated attacks miss"
    ],
    "dos_zh": [
      "纳入具有不同背景、专业知识和生活经验的多元化红队成员——模型失效往往对特定社区的影响不成比例，同质化团队会遗漏这些失效模式",
      "用可复现的提示词和完整的模型输出（而非仅摘要）记录所有发现的失效——详细文档对于追踪修复和回归测试至关重要",
      "在整个模型开发生命周期中迭代运行红队演练，而非仅作为发布前的最终关卡——早期发现的问题修复成本更低，并防止训练期间的安全回退",
      "将自动化红队测试（使用 Garak 或 PyRIT 等基于 LLM 的攻击生成工具）与人工红队测试相结合——自动化提供规模，人工提供自动化攻击所缺失的创造力和文化背景"
    ],
    "donts": [
      "Do not treat red teaming as a one-time checkbox exercise before launch — AI systems exhibit emergent failure modes that appear only in production contexts and require ongoing adversarial monitoring post-deployment",
      "Do not limit red team scope to only the most obvious harm categories — novel LLM misuse scenarios (indirect prompt injection via RAG, multimodal attacks, agentic misuse) require explicit inclusion in the threat model",
      "Do not share red team findings publicly or in full detail before mitigations are deployed — detailed jailbreak prompts and attack vectors published prematurely enable exploitation before defences are in place",
      "Do not conflate red teaming with standard QA testing — red teaming requires adversarial mindset and creative exploration of the attack surface, not verification that features work as specified"
    ],
    "donts_zh": [
      "不要将红队测试视为发布前一次性的勾选练习——AI 系统会表现出仅在生产上下文中出现的涌现失效模式，需要部署后持续的对抗性监控",
      "不要将红队范围限制在最明显的危害类别——新型 LLM 滥用场景（通过 RAG 的间接提示注入、多模态攻击、代理滥用）需要明确纳入威胁模型",
      "在缓解措施部署之前不要公开或完整详细地分享红队发现——过早发布的详细越狱提示词和攻击向量会在防御就位之前使利用成为可能",
      "不要将红队测试与标准 QA 测试混为一谈——红队测试需要对抗性思维和对攻击面的创造性探索，而非验证功能是否按规格工作"
    ],
    "case_study_company": "Microsoft",
    "case_study": "Microsoft established one of the first dedicated AI red teams in 2018 and has since conducted over 100 red team engagements across products including Copilot, Bing Chat, Azure OpenAI Service, and GitHub Copilot. In their published methodology, they describe a structured process of threat modelling, attack taxonomy development, and hybrid human-automated red teaming. For the Bing Chat integration (2023), the red team discovered that the model could be manipulated through indirect prompt injection via web page content retrieved by the search grounding feature — a novel attack vector not present in standard chatbot threat models. This finding led to architectural mitigations including output filtering for injected instructions and retrieval content sanitisation before inclusion in the context window. Microsoft also open-sourced PyRIT (Python Risk Identification Toolkit), their internal automated red teaming framework, enabling the broader AI safety community to scale adversarial testing.",
    "case_study_zh": "Microsoft 于 2018 年建立了首批专门的 AI 红队之一，此后对包括 Copilot、Bing Chat、Azure OpenAI Service 和 GitHub Copilot 在内的产品开展了 100 余次红队测试。在其发布的方法论中，他们描述了威胁建模、攻击分类体系开发以及人机混合红队测试的结构化流程。在 Bing Chat 集成（2023 年）中，红队发现该模型可通过搜索基础功能检索的网页内容进行间接提示注入攻击——这是标准聊天机器人威胁模型中未出现的新型攻击向量。这一发现推动了架构缓解措施的实施，包括针对注入指令的输出过滤和将检索内容纳入上下文窗口前的清洗处理。Microsoft 还开源了内部自动化红队框架 PyRIT（Python 风险识别工具包），使更广泛的 AI 安全社区能够扩展对抗性测试。",
    "when_not_to_use": [
      "Purely deterministic rule-based AI systems with no generative components where traditional software security testing and input validation are more appropriate than LLM-specific red teaming",
      "Internal-only AI tools with restricted access and no user-facing interface where the attack surface is limited to trusted internal actors with existing access controls",
      "Very early research prototypes where the model architecture and training are still being fundamentally redesigned and red team findings will be rendered obsolete by the next training run",
      "As a substitute for ongoing monitoring and incident response — red teaming before deployment identifies known vulnerabilities but cannot predict all emergent failure modes in production"
    ],
    "when_not_to_use_zh": [
      "没有生成组件的纯确定性规则 AI 系统，传统软件安全测试和输入验证比 LLM 特定的红队测试更为合适",
      "访问受限且无用户界面的纯内部 AI 工具，攻击面仅限于具有现有访问控制的可信内部行为者",
      "模型架构和训练仍在进行根本性重新设计的极早期研究原型，红队发现将在下一次训练运行中变得过时",
      "作为持续监控和事件响应的替代品——部署前的红队测试能识别已知漏洞，但无法预测生产中所有涌现的失效模式"
    ],
    "adopters": [
      "Microsoft",
      "OpenAI",
      "Google DeepMind",
      "Anthropic",
      "Meta AI",
      "Hugging Face"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Microsoft (2023). \"Microsoft AI Red Team building future of safer AI\". microsoft.com/security/blog.",
    "secondary_sources": [
      "Perez, F. & Ribeiro, I. (2022). \"Ignore Previous Prompt: Attack Techniques for Language Models\". arXiv:2211.09527.",
      "NIST (2023). \"Artificial Intelligence Risk Management Framework (AI RMF 1.0)\". nist.gov/airmf.",
      "Feffer, M. et al. (2024). \"Red-Teaming for Generative AI: Silver Bullet or Security Theater?\". arXiv:2401.15897."
    ],
    "typed_relations": [
      {
        "slug": "llm-evaluation-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 303,
    "name": "AI Gateway Pattern",
    "name_zh": "AI 网关模式",
    "slug": "ai-gateway-pattern",
    "category": "ai",
    "desc": "Centralized proxy for LLM API management, rate limiting, caching, and observability",
    "desc_zh": "集中式代理，用于大模型 API 管理、限流、缓存与可观测性",
    "steps": [
      "Deploy a gateway service (Kong AI Gateway, Portkey, LiteLLM) in front of all LLM provider endpoints",
      "Configure routing rules to fan-out requests across providers (OpenAI, Anthropic, Azure) with fallback chains",
      "Enable request-level caching using semantic similarity or exact-match keys to reduce duplicate API calls",
      "Enforce rate limits, budget caps, and per-team quotas so no single consumer can exhaust shared capacity",
      "Instrument the gateway to emit per-request logs, latency metrics, token usage, and cost breakdowns to your observability stack"
    ],
    "steps_zh": [
      "在所有大模型提供商端点前部署网关服务（Kong AI Gateway、Portkey、LiteLLM 等）",
      "配置路由规则，将请求分发至多个提供商（OpenAI、Anthropic、Azure）并设置降级链",
      "使用语义相似度或精确匹配键启用请求级缓存，减少重复 API 调用",
      "强制执行限流、预算上限和团队级配额，防止单一消费者耗尽共享容量",
      "对网关进行埋点，将请求日志、延迟指标、Token 用量和费用明细输出至可观测性平台"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Client",
      "Gateway",
      "Auth/Rate",
      "Model"
    ],
    "viz_labels_zh": [
      "客户端",
      "AI网关",
      "鉴权限流",
      "模型服务"
    ],
    "related": [
      "llm-system-design-patterns",
      "ai-observability-framework",
      "prompt-caching-strategies"
    ],
    "tags": [
      "gateway",
      "proxy",
      "rate-limiting",
      "caching",
      "llm-ops",
      "cost-control"
    ],
    "origin_author": "Anthropic",
    "origin_source": "Anthropic Engineering Blog — Building responsible LLM infrastructure with API gateways (2023)",
    "origin_source_zh": "Anthropic 工程博客——使用 API 网关构建负责任的大模型基础设施（2023）",
    "complexity": "intermediate",
    "when_to_use": [
      "Organizations with multiple teams consuming LLM APIs and needing centralized cost governance and quota enforcement",
      "Production systems requiring provider redundancy — route to a backup model when the primary provider is unavailable",
      "Any service where prompt caching can deliver significant cost savings by serving repeated or semantically similar queries from cache",
      "Enterprises with strict compliance requirements that need every LLM request logged and auditable in a single place"
    ],
    "when_to_use_zh": [
      "多团队共用大模型 API、需要集中成本治理和配额管控的组织",
      "需要提供商冗余的生产系统——主提供商不可用时自动路由至备用模型",
      "可通过提示缓存大幅降低成本、反复或语义相似查询比例较高的服务",
      "有严格合规要求、需要在单一位置记录和审计所有大模型请求的企业"
    ],
    "core_concepts": [
      "Semantic Cache: Storing LLM responses keyed by embedding similarity so semantically equivalent prompts hit cache instead of the model",
      "Provider Fallback: Automatically rerouting requests to an alternative LLM provider when the primary returns an error or exceeds latency thresholds",
      "Budget Guard: Per-team or per-project spend limits enforced at the gateway layer before tokens are ever sent upstream",
      "Request Normalization: Translating provider-specific API formats into a unified schema so application code is provider-agnostic",
      "Audit Log: An immutable, tamper-evident log of every request and response passing through the gateway for compliance and debugging"
    ],
    "core_concepts_zh": [
      "语义缓存：以向量相似度为键存储大模型响应，语义等价的提示词直接命中缓存而不调用模型",
      "提供商降级（Provider Fallback）：主提供商报错或超出延迟阈值时，自动将请求路由至备用大模型",
      "预算守卫（Budget Guard）：在网关层按团队或项目强制执行消费上限，Token 发送上游前即被拦截",
      "请求规范化：将各提供商特定的 API 格式转换为统一模式，使应用代码与提供商解耦",
      "审计日志：对经过网关的每条请求与响应进行不可变、防篡改记录，用于合规和调试"
    ],
    "timeline": [
      [
        "2023-03",
        "ChatGPT API demand surge drives teams to build ad-hoc proxies for rate-limit management"
      ],
      [
        "2023-07",
        "LiteLLM open-sourced as a unified proxy layer supporting 100+ LLM providers"
      ],
      [
        "2023-11",
        "Portkey and Kong release dedicated AI gateway products with semantic caching and observability"
      ],
      [
        "2024-06",
        "AWS, Azure, and GCP launch managed AI gateway services integrated with their cloud platforms"
      ]
    ],
    "timeline_zh": [
      [
        "2023-03",
        "ChatGPT API 需求激增，推动各团队构建临时代理来管理限流问题"
      ],
      [
        "2023-07",
        "LiteLLM 开源，作为统一代理层支持 100 余个大模型提供商"
      ],
      [
        "2023-11",
        "Portkey 和 Kong 发布专用 AI 网关产品，内置语义缓存与可观测性"
      ],
      [
        "2024-06",
        "AWS、Azure、GCP 推出托管 AI 网关服务，与各自云平台深度集成"
      ]
    ],
    "dos": [
      "Version your routing rules as code so provider migrations and fallback changes are reviewable and reversible",
      "Instrument cache hit rates per route — a rate below 20% suggests the cache key strategy needs tuning",
      "Set hard token-budget alerts well below your actual limit so you have reaction time before costs spiral",
      "Test fallback chains in staging by intentionally disabling the primary provider endpoint"
    ],
    "dos_zh": [
      "将路由规则版本化为代码，使提供商迁移和降级变更可审查、可回滚",
      "按路由监控缓存命中率——低于 20% 意味着缓存键策略需要调整",
      "将 Token 预算告警阈值设置在实际上限的较低水位，留出成本失控前的响应时间",
      "在预发环境中通过主动禁用主提供商端点来测试降级链的完整性"
    ],
    "donts": [
      "Don't cache responses for prompts containing user-specific PII without anonymisation — cache poisoning is a real risk",
      "Don't route all traffic through a single-region gateway — this creates a latency bottleneck and a single point of failure",
      "Don't skip authentication at the gateway layer — unauthenticated proxies expose your API keys to anyone who discovers the endpoint",
      "Don't use overly aggressive semantic similarity thresholds for caching — near-duplicate prompts may need different answers"
    ],
    "donts_zh": [
      "不要在未脱敏的情况下缓存含用户 PII 的提示词响应——缓存投毒风险真实存在",
      "不要将所有流量路由至单区域网关——这会制造延迟瓶颈和单点故障",
      "不要跳过网关层的认证——未授权代理会将 API 密钥暴露给任何发现该端点的人",
      "不要对语义缓存使用过于激进的相似度阈值——近似重复的提示词可能需要不同的答案"
    ],
    "case_study_company": "Grab",
    "case_study": "Grab's AI platform team deployed an internal LLM gateway to serve 40+ product teams across their super-app. The gateway enforced per-squad token budgets, enabled semantic caching that cut API costs by 35% in the first quarter, and provided a unified audit trail required by their financial services regulators. When OpenAI experienced an outage in late 2023, automatic fallback to Azure OpenAI Service kept all customer-facing AI features fully operational with no engineer intervention.",
    "case_study_zh": "Grab 的 AI 平台团队部署了内部大模型网关，服务于超级应用中的 40 余个产品团队。该网关强制执行按小队的 Token 预算，启用语义缓存后第一季度 API 成本降低 35%，并提供了金融服务监管机构要求的统一审计追踪。2023 年底 OpenAI 出现故障时，自动降级至 Azure OpenAI Service 使所有面向用户的 AI 功能保持完全正常运行，无需工程师介入。",
    "when_not_to_use": [
      "Single-model, single-team prototypes where the operational overhead of a gateway exceeds its benefits",
      "Latency-critical inference paths where adding a network hop is unacceptable — consider co-locating the gateway in the same VPC",
      "Research and experimentation environments where rigid quotas would slow down exploration",
      "When your LLM provider already offers native rate limiting, caching, and observability that fully meets your requirements"
    ],
    "when_not_to_use_zh": [
      "单模型、单团队原型项目，网关的运维开销超过其带来的收益",
      "对延迟极度敏感的推理路径，增加一跳网络不可接受——此时考虑将网关与业务共置于同一 VPC",
      "严格配额会拖慢探索节奏的研究和实验环境",
      "大模型提供商已原生提供限流、缓存和可观测性、且完全满足需求的场景"
    ],
    "adopters": [
      "Grab",
      "Shopify",
      "Cloudflare",
      "Kong",
      "Portkey",
      "Uber"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "performance",
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Anthropic (2023). \"Building responsible LLM infrastructure with API gateways\". anthropic.com/engineering.",
    "secondary_sources": [
      "BerriAI (2023). \"LiteLLM: Call all LLM APIs using the OpenAI format\". github.com/BerriAI/litellm.",
      "Kong (2024). \"Kong AI Gateway: Unified LLM API management\". konghq.com/products/kong-ai-gateway.",
      "Portkey (2024). \"The AI Gateway for production LLM apps\". portkey.ai/docs."
    ],
    "typed_relations": [
      {
        "slug": "llm-system-design-patterns",
        "type": "complement"
      },
      {
        "slug": "ai-observability-framework",
        "type": "complement"
      },
      {
        "slug": "prompt-caching-strategies",
        "type": "related"
      }
    ]
  },
  {
    "id": 304,
    "name": "Prompt Caching Strategies",
    "name_zh": "提示词缓存策略",
    "slug": "prompt-caching-strategies",
    "category": "ai",
    "desc": "Reducing LLM inference costs and latency by reusing KV cache from shared prompt prefixes",
    "desc_zh": "通过复用共享提示词前缀的 KV 缓存，降低大模型推理成本与延迟",
    "steps": [
      "Identify the static portions of your prompt (system instructions, documents, few-shot examples) that are shared across many requests",
      "Move all static content to the beginning of the prompt so it forms a cacheable prefix before the dynamic user turn",
      "Enable provider-level prefix caching (Anthropic cache_control, OpenAI automatic caching) and verify cache hit metrics in API responses",
      "Design your turn structure so the cache prefix is reused for at least 2-5 requests to recoup the cache write cost",
      "Monitor cache hit rates and TTL behaviour per endpoint; restructure prompts where hit rates are below expectations"
    ],
    "steps_zh": [
      "识别提示词中跨请求共享的静态部分（系统指令、文档、少样本示例等）",
      "将所有静态内容移至提示词开头，使其在动态用户轮次之前构成可缓存前缀",
      "启用提供商级前缀缓存（Anthropic cache_control、OpenAI 自动缓存），并在 API 响应中验证缓存命中指标",
      "设计轮次结构，使缓存前缀被至少 2-5 次请求复用，以覆盖缓存写入成本",
      "按端点监控缓存命中率和 TTL 行为，对命中率低于预期的提示词进行重构"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Static Cache",
      "Dynamic Cache",
      "Prefix",
      "TTL"
    ],
    "viz_labels_zh": [
      "静态缓存",
      "动态缓存",
      "前缀缓存",
      "过期策略"
    ],
    "related": [
      "ai-gateway-pattern",
      "context-window-management",
      "llm-system-design-patterns"
    ],
    "tags": [
      "caching",
      "cost-optimization",
      "kv-cache",
      "prompt-engineering",
      "llm-ops"
    ],
    "origin_author": "OpenAI",
    "origin_source": "OpenAI Platform Documentation — Prompt Caching (2024); Anthropic API Docs — Prompt Caching (2024)",
    "origin_source_zh": "OpenAI 平台文档——提示词缓存（2024）；Anthropic API 文档——提示词缓存（2024）",
    "complexity": "beginner",
    "when_to_use": [
      "Chatbot and conversational AI applications with long system prompts repeated verbatim across thousands of turns",
      "Document Q&A systems where the same large document context is queried many times per session or across users",
      "Few-shot prompting setups where a fixed set of examples is prepended to every single request",
      "High-volume API integrations where token costs are a significant operational expense and latency reduction matters"
    ],
    "when_to_use_zh": [
      "系统提示词较长且在数千轮对话中逐字重复的聊天机器人和对话式 AI 应用",
      "同一大型文档上下文在一次会话或跨用户被多次查询的文档问答系统",
      "固定少样本示例集被前置于每条请求的少样本提示场景",
      "Token 成本是主要运营支出、且延迟优化有价值的大流量 API 集成"
    ],
    "core_concepts": [
      "KV Cache: The key-value attention cache computed during LLM prefill; reusing it skips expensive recomputation of shared prefix tokens",
      "Prefix Stability: The requirement that cached tokens occupy the exact same position in the prompt on every request — any change invalidates the cache",
      "Cache Write vs Hit Cost: Writing a new cache entry costs slightly more than a standard token; savings accumulate only after sufficient cache hits",
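      "TTL (Time-to-Live): The duration a cached prefix remains valid; Anthropic defaults to a 5-minute TTL that is refreshed on every cache hit, while OpenAI typically evicts cached prefixes after 5-10 minutes of inactivity and always within 1 hour of last use",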
      "Cache Breakpoint: A marker (e.g., Anthropic's cache_control) that tells the provider exactly where the cacheable prefix ends and the dynamic content begins"
    ],
    "core_concepts_zh": [
      "KV 缓存：大模型预填充阶段计算的键值注意力缓存；复用它可跳过对共享前缀 Token 的昂贵重计算",
      "前缀稳定性：缓存 Token 在每次请求中必须占据完全相同的位置——任何变化都会使缓存失效",
      "缓存写入与命中成本：写入新缓存条目的成本略高于标准 Token；只有在足够多次缓存命中后才能实现净节省",
      "TTL（生存时间）：缓存前缀保持有效的时长；Anthropic 默认使用 5 分钟 TTL 并在每次命中时刷新，OpenAI 通常在 5-10 分钟无活动后清除缓存，且最迟在最后一次使用后 1 小时内清除",
      "缓存断点：告知提供商可缓存前缀确切结束位置的标记（如 Anthropic 的 cache_control），动态内容从此处开始"
    ],
    "timeline": [
      [
        "2024-08",
        "Anthropic launches prompt caching in public beta for Claude 3.5 Sonnet and Claude 3 Haiku, offering up to 90% cost reduction on cached tokens"
      ],
      [
        "2024-10",
        "OpenAI introduces automatic prompt caching for GPT-4o and GPT-4o mini with 50% discount on cached input tokens"
      ],
      [
        "2024-10",
        "Google Gemini 1.5 adds context caching with explicit API controls and per-session TTL management"
      ],
      [
        "2025-01",
        "Prompt caching becomes a standard feature across all major LLM providers; LLM gateway products surface unified cache analytics"
      ]
    ],
    "timeline_zh": [
      [
        "2024-08",
        "Anthropic 为 Claude 3.5 Sonnet 和 Claude 3 Haiku 推出提示词缓存公开测试版，缓存 Token 成本最高可降低 90%"
      ],
      [
        "2024-10",
        "OpenAI 为 GPT-4o 和 GPT-4o mini 引入自动提示词缓存，缓存输入 Token 享 50% 折扣"
      ],
      [
        "2024-10",
        "Google Gemini 1.5 新增上下文缓存功能，提供显式 API 控制和会话级 TTL 管理"
      ],
      [
        "2025-01",
        "提示词缓存成为所有主流大模型提供商的标准功能；大模型网关产品开始呈现统一缓存分析视图"
      ]
    ],
    "dos": [
      "Front-load everything static: system prompt, retrieved documents, few-shot examples, then append the user message last",
      "Aim for a prefix length of at least 1024 tokens — shorter prefixes may not qualify for caching on some providers",
      "Track the cache_creation_input_tokens and cache_read_input_tokens fields returned by the API to measure actual savings",
      "Use a deterministic prompt template so that the static prefix bytes are identical across requests — even whitespace differences bust the cache"
    ],
    "dos_zh": [
      "将所有静态内容前置：系统提示词、检索文档、少样本示例，最后追加用户消息",
      "目标前缀长度至少 1024 Token——较短的前缀在部分提供商处可能不符合缓存条件",
      "追踪 API 返回的 cache_creation_input_tokens 和 cache_read_input_tokens 字段以衡量实际节省",
      "使用确定性提示词模板，确保静态前缀的字节内容在请求间完全一致——即便空白字符差异也会导致缓存失效"
    ],
    "donts": [
      "Don't include timestamps, request IDs, or other dynamic values inside the cached prefix — they invalidate every cache entry",
      "Don't assume caching is free — cache writes cost 1.25× the standard token price on Anthropic; model the break-even point for your traffic pattern",
      "Don't cache sensitive user data inside a shared prefix — other users' requests may read from the same cache entry",
      "Don't rely on caching to substitute for context window optimisation — high token counts still incur latency even when cached"
    ],
    "donts_zh": [
      "不要在缓存前缀内包含时间戳、请求 ID 或其他动态值——它们会使每个缓存条目失效",
      "不要以为缓存是免费的——在 Anthropic，缓存写入成本为标准 Token 价格的 1.25 倍；需根据流量模式建模盈亏平衡点",
      "不要将敏感用户数据缓存在共享前缀中——其他用户的请求可能读取同一缓存条目",
      "不要依赖缓存来替代上下文窗口优化——即使命中缓存，高 Token 数量仍会带来延迟"
    ],
    "case_study_company": "Notion",
    "case_study": "Notion's AI team applied prompt caching to their document Q&A feature, where a user's full workspace context (often 50,000+ tokens of notes and pages) was being resent on every follow-up question. By restructuring prompts to place the document corpus before the conversation history and enabling Anthropic's cache_control, they reduced median input token costs for multi-turn sessions by 78% and cut time-to-first-token latency by 40%, directly improving the perceived responsiveness of the AI assistant.",
    "case_study_zh": "Notion 的 AI 团队将提示词缓存应用于文档问答功能。此前，用户完整的工作区上下文（通常超过 5 万 Token 的笔记和页面内容）在每个追问时都会被重新发送。通过将文档语料库置于对话历史之前重构提示词，并启用 Anthropic 的 cache_control，多轮会话的中位输入 Token 成本降低了 78%，首 Token 延迟减少了 40%，直接提升了 AI 助手的感知响应速度。",
    "when_not_to_use": [
      "Highly personalised prompts where every request has a unique system prompt — no stable prefix exists to cache",
      "Single-turn or very low-volume endpoints where the cache write overhead never amortises across enough hits",
      "Prompts shorter than ~1024 tokens where provider minimum thresholds prevent caching from activating",
      "Workflows that mutate the system prompt frequently (e.g., A/B testing prompt variations) — constant cache misses negate any benefit"
    ],
    "when_not_to_use_zh": [
      "高度个性化的提示词，每次请求都有独特系统提示——不存在可缓存的稳定前缀",
      "单轮或极低流量的端点，缓存写入开销无法在足够多次命中中摊销",
      "提示词短于约 1024 Token、提供商最低阈值阻止缓存激活的场景",
      "频繁修改系统提示词的工作流（如提示词变体 A/B 测试）——持续缓存未命中使一切收益归零"
    ],
    "adopters": [
      "Notion",
      "Cursor",
      "Perplexity AI",
      "GitHub Copilot",
      "Anthropic (internal tooling)",
      "OpenAI (platform)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "performance",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Anthropic (2024). \"Prompt Caching\". docs.anthropic.com/en/docs/build-with-claude/prompt-caching.",
    "secondary_sources": [
      "OpenAI (2024). \"Prompt Caching\". platform.openai.com/docs/guides/prompt-caching.",
      "Google (2024). \"Context caching overview\". ai.google.dev/gemini-api/docs/caching.",
      "Pope, R. et al. (2023). \"Efficiently Scaling Transformer Inference\". MLSys 2023."
    ],
    "typed_relations": [
      {
        "slug": "ai-gateway-pattern",
        "type": "complement"
      },
      {
        "slug": "context-window-management",
        "type": "complement"
      },
      {
        "slug": "llm-system-design-patterns",
        "type": "related"
      }
    ]
  },
  {
    "id": 305,
    "name": "Evaluation-Driven Development",
    "name_zh": "评估驱动开发",
    "slug": "evaluation-driven-development",
    "category": "ai",
    "desc": "Building AI applications by writing eval suites before features, using test results to guide iteration",
    "desc_zh": "通过在编写功能前先构建评估套件来开发 AI 应用，并以测试结果驱动迭代",
    "steps": [
      "Define the success criteria for the AI feature in measurable terms before writing a single prompt or training a single model",
      "Build a labelled golden dataset of input-output pairs that covers edge cases, failure modes, and distribution shift scenarios",
      "Implement automated eval runners (LLM-as-judge, unit assertions, human spot-checks) that score every candidate prompt or model against the dataset",
      "Only ship a new prompt version, model, or feature when it passes all eval thresholds — treat a failing eval as a failing test",
      "Continuously expand the eval suite with production failures and edge cases discovered after deployment; re-run evals before every release"
    ],
    "steps_zh": [
      "在编写任何提示词或训练任何模型之前，用可量化的指标定义 AI 功能的成功标准",
      "构建带标注的黄金数据集，覆盖边缘情况、失败模式和分布偏移场景",
      "实现自动化评估运行器（大模型作为评审、单元断言、人工抽查），对每个候选提示词或模型在数据集上打分",
      "只有在通过所有评估阈值后才上线新版提示词、模型或功能——将评估失败视同测试失败处理",
      "持续用上线后发现的生产故障和边缘情况扩充评估套件；每次发布前重新运行评估"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Define Evals",
      "Build",
      "Run Evals",
      "Improve"
    ],
    "viz_labels_zh": [
      "定义评估",
      "构建",
      "运行评估",
      "持续改进"
    ],
    "related": [
      "llm-evaluation-framework",
      "rag-architecture",
      "ai-observability-framework"
    ],
    "tags": [
      "evals",
      "testing",
      "llm-ops",
      "quality-assurance",
      "prompt-engineering",
      "ai-safety"
    ],
    "origin_author": "Hamel Husain",
    "origin_source": "Hamel Husain — \"Your AI Product Needs Evals\" (2024); Eugene Yan — \"Patterns for Building LLM-based Systems & Products\" (2023)",
    "origin_source_zh": "Hamel Husain——「你的 AI 产品需要评估」（2024）；Eugene Yan——「构建大模型系统与产品的模式」（2023）",
    "complexity": "advanced",
    "when_to_use": [
      "Any AI feature that will be shipped to real users where quality regressions are costly or damaging to trust",
      "Teams iterating on prompts or fine-tuned models where intuition-based evaluation slows down or misleads development",
      "Regulated domains (healthcare, legal, finance) where you must demonstrate that your AI meets a defined quality threshold",
      "Long-lived AI products where model upgrades, provider migrations, or prompt changes need safe rollout verification"
    ],
    "when_to_use_zh": [
      "任何将面向真实用户发布的 AI 功能，质量回归代价高昂或会损害信任",
      "在提示词或微调模型上迭代时，凭直觉评估会拖慢或误导开发的团队",
      "必须证明 AI 满足特定质量阈值的受监管领域（医疗、法律、金融）",
      "长期维护的 AI 产品，模型升级、提供商迁移或提示词变更需要安全的上线验证"
    ],
    "core_concepts": [
      "Golden Dataset: A curated, human-labelled set of representative input-output pairs that defines acceptable behaviour for the AI feature",
      "LLM-as-Judge: Using a powerful LLM (GPT-4, Claude Opus) as an automated evaluator that scores candidate outputs against a rubric",
      "Eval Threshold: A quantitative pass/fail bar (e.g., BLEU ≥ 0.7, judge score ≥ 4/5, hallucination rate < 2%) that must be met before shipping",
      "Regression Suite: A growing corpus of previously-failing examples added to the eval set to prevent reintroduction of known bugs",
      "Slice Analysis: Breaking down eval scores by input category, language, topic, or user segment to surface hidden failure modes"
    ],
    "core_concepts_zh": [
      "黄金数据集（Golden Dataset）：一组经过人工标注的代表性输入输出对，定义了 AI 功能的可接受行为",
      "大模型即评审（LLM-as-Judge）：使用强大的大模型（GPT-4、Claude Opus）作为自动评估器，按评分标准对候选输出打分",
      "评估阈值（Eval Threshold）：上线前必须达到的量化通过/失败标准（如 BLEU ≥ 0.7、评审得分 ≥ 4/5、幻觉率 < 2%）",
      "回归测试集（Regression Suite）：将之前失败的示例持续添加到评估集，防止已知问题被重新引入",
      "切片分析（Slice Analysis）：按输入类别、语言、主题或用户群体拆解评估分数，揭示隐藏的失效模式"
    ],
    "timeline": [
      [
        "2022-11",
        "ChatGPT launch exposes the difficulty of evaluating open-ended LLM outputs; teams resort to ad-hoc human review"
      ],
      [
        "2023-06",
        "Hamel Husain publishes influential essays arguing evals are the highest-leverage investment in AI product development"
      ],
      [
        "2023-09",
        "OpenAI Evals framework open-sourced; Braintrust, Weights & Biases, and Confident AI release dedicated eval platforms"
      ],
      [
        "2024-03",
        "Anthropic publishes model specification and internal eval methodology; LLM-as-judge becomes the dominant automated eval pattern"
      ],
      [
        "2024-12",
        "Evaluation-Driven Development emerges as the consensus best practice for LLM product teams shipping at scale"
      ]
    ],
    "timeline_zh": [
      [
        "2022-11",
        "ChatGPT 发布暴露了评估开放式大模型输出的困难，各团队依赖临时人工审查"
      ],
      [
        "2023-06",
        "Hamel Husain 发表颇具影响力的文章，主张评估是 AI 产品开发中杠杆最高的投资"
      ],
      [
        "2023-09",
        "OpenAI Evals 框架开源；Braintrust、Weights & Biases、Confident AI 发布专用评估平台"
      ],
      [
        "2024-03",
        "Anthropic 发布模型规范和内部评估方法论；大模型即评审成为主流自动化评估模式"
      ],
      [
        "2024-12",
        "评估驱动开发成为规模化大模型产品团队的共识最佳实践"
      ]
    ],
    "dos": [
      "Start with a small golden dataset of 50-100 examples and grow it iteratively — a small high-quality set beats a large noisy one",
      "Combine automated evals (fast, cheap, consistent) with periodic human spot-checks (catches subtle issues automated judges miss)",
      "Make eval runs part of your CI/CD pipeline so every pull request that changes a prompt gets automatically evaluated",
      "Use slice analysis to audit evals across languages, user types, and edge cases — aggregate scores hide important failure clusters"
    ],
    "dos_zh": [
      "从 50-100 个示例的小型黄金数据集开始，迭代扩充——小而精的集合胜过大而杂的",
      "将自动化评估（快速、低成本、一致）与定期人工抽查（发现自动评审遗漏的细微问题）结合使用",
      "将评估运行纳入 CI/CD 流水线，使每个修改提示词的 PR 都能自动触发评估",
      "使用切片分析跨语言、用户类型和边缘情况审计评估结果——汇总分数会掩盖重要的失效集群"
    ],
    "donts": [
      "Don't treat evals as a one-time pre-launch checklist — production failures must feed back into the eval suite continuously",
      "Don't use LLM-as-judge as your only evaluator for your own model — the judge may share the same blind spots as the model being evaluated",
      "Don't optimise blindly for a single metric — teams that overfit their prompts to an eval benchmark often degrade real-world quality",
      "Don't skip writing evals because the task seems too subjective — even creative tasks can have measurable quality dimensions (coherence, factuality, tone)"
    ],
    "donts_zh": [
      "不要将评估视为一次性上线前清单——生产故障必须持续反馈到评估套件中",
      "不要仅用大模型即评审来评估自己的模型——评审模型可能与被评估模型存在相同的盲点",
      "不要盲目优化单一指标——将提示词过度拟合到评估基准的团队往往会降低真实世界的质量",
      "不要因任务看起来过于主观而跳过评估——即便是创意任务也可以有可量化的质量维度（连贯性、事实性、语调）"
    ],
    "case_study_company": "Duolingo",
    "case_study": "Duolingo's AI team adopted Evaluation-Driven Development when building their AI-powered language tutoring features for Duolingo Max. Before shipping any new exercise type or conversation scenario, the team assembled expert linguist panels to create golden datasets of expected dialogue flows, then built automated LLM-as-judge pipelines to score candidate outputs for grammatical correctness, pedagogical appropriateness, and cultural sensitivity. This approach allowed them to iterate 10× faster than pure human review while catching critical errors — including a model that inappropriately simplified responses for advanced learners — before they reached any users. The eval infrastructure became the team's primary quality gate for all subsequent model upgrades.",
    "case_study_zh": "Duolingo 的 AI 团队在构建 Duolingo Max 的 AI 驱动语言辅导功能时采用了评估驱动开发。在上线任何新练习类型或对话场景之前，团队组织了由专业语言学家组成的评审小组，创建了期望对话流程的黄金数据集，然后构建了自动化大模型即评审管线，对候选输出在语法正确性、教学适宜性和文化敏感性上进行评分。这一方法使团队的迭代速度比纯人工审查快 10 倍，同时在到达用户之前捕获了关键错误——包括一个不当简化高级学习者回应的模型问题。评估基础设施成为团队对所有后续模型升级的主要质量门禁。",
    "when_not_to_use": [
      "Pure research or exploratory prototyping where the problem definition itself is still being discovered — premature evals create false precision",
      "Ultra-low-stakes internal tools where the cost of building and maintaining an eval suite exceeds the cost of the occasional bad output",
      "Highly novel task domains with no existing labelled examples and no clear ground truth — invest in data collection first",
      "As a replacement for alignment and safety work — evals measure performance on known dimensions but cannot guarantee safety against unknown failure modes"
    ],
    "when_not_to_use_zh": [
      "纯研究或探索性原型阶段，问题定义本身仍在发现中——过早的评估会制造虚假的精确感",
      "风险极低的内部工具，构建和维护评估套件的成本超过偶尔出现不良输出的代价",
      "没有现有标注样本且缺乏明确基准答案的全新任务领域——应先投资于数据收集",
      "作为对齐和安全工作的替代品——评估能衡量已知维度上的性能，但无法保证对未知失效模式的安全性"
    ],
    "adopters": [
      "Duolingo",
      "Anthropic",
      "OpenAI",
      "Weights & Biases",
      "Braintrust",
      "Scale AI"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "maintainability",
      "performance"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Husain, H. (2024). \"Your AI Product Needs Evals\". hamel.dev/blog/posts/evals.",
    "secondary_sources": [
      "Yan, E. (2023). \"Patterns for Building LLM-based Systems & Products\". eugeneyan.com/writing/llm-patterns.",
      "Anthropic (2024). \"Evaluating AI systems: our approach to model evaluation\". anthropic.com/research.",
      "Shankar, S. et al. (2024). \"Who Validates the Validators? Aligning LLM-Assisted Evaluation of LLM Outputs with Human Preferences\". arXiv:2404.12272."
    ],
    "typed_relations": [
      {
        "slug": "llm-evaluation-framework",
        "type": "extends"
      },
      {
        "slug": "rag-architecture",
        "type": "complement"
      },
      {
        "slug": "ai-observability-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 101,
    "name": "Data Mesh",
    "name_zh": "数据网格",
    "slug": "data-mesh",
    "category": "data",
    "desc": "Domain-oriented decentralized data ownership and architecture",
    "desc_zh": "面向领域的去中心化数据所有权与架构",
    "steps": [
      "Decompose monolithic data platforms by identifying domain boundaries and assigning data ownership to the teams closest to the business context",
      "Treat data as a product: each domain team publishes discoverable, documented, and quality-assured data products with SLOs",
      "Build a self-serve data infrastructure platform that abstracts away storage, processing, and governance complexity for domain teams",
      "Implement federated computational governance by codifying global interoperability standards, security policies, and compliance rules as platform capabilities",
      "Establish a data product catalog and mesh-wide observability so consumers can discover, trust, and compose data products across domains"
    ],
    "steps_zh": [
      "通过识别领域边界将单体数据平台解耦，将数据所有权分配给最接近业务上下文的团队",
      "将数据视为产品：每个领域团队发布可发现、有文档、有质量保障且带SLO的数据产品",
      "构建自助式数据基础设施平台，为领域团队抽象存储、处理和治理的复杂性",
      "实施联邦式计算治理，将全局互操作标准、安全策略和合规规则编码为平台能力",
      "建立数据产品目录和网格范围的可观测性，使消费者能够跨领域发现、信任和组合数据产品"
    ],
    "ai_relevant": true,
    "viz_type": "hexgrid",
    "viz_labels": [
      "Domain Ownership",
      "Data Product",
      "Self-Serve Infra",
      "Federated Gov",
      "Data Catalog"
    ],
    "viz_labels_zh": [
      "域所有权",
      "数据产品",
      "自助基础设施",
      "联邦治理",
      "数据目录"
    ],
    "related": [
      "domain-driven-design",
      "microservices-decomposition",
      "team-topologies",
      "polyglot-persistence"
    ],
    "tags": [
      "data-mesh",
      "decentralization",
      "data-products",
      "domain-ownership",
      "governance"
    ],
    "origin_author": "Zhamak Dehghani, 2019",
    "origin_source": "Data Mesh: Delivering Data-Driven Value at Scale (O'Reilly, 2022)",
    "origin_source_zh": "《数据网格：大规模交付数据驱动价值》（O'Reilly，2022）",
    "complexity": "advanced",
    "when_to_use": [
      "When a centralized data team has become a bottleneck for multiple business domains",
      "When domain teams need autonomous control over their analytical and operational data",
      "When organizational scale demands distributed ownership matching Conway's Law",
      "When data quality suffers because producers are disconnected from consumers"
    ],
    "when_to_use_zh": [
      "当中心化数据团队成为多个业务领域的瓶颈时",
      "当领域团队需要自主控制其分析和运营数据时",
      "当组织规模要求分布式所有权以匹配康威定律时",
      "当数据质量因生产者与消费者脱节而下降时"
    ],
    "core_concepts": [
      "Domain ownership: data is owned and served by the domain that produces it, not by a central data team",
      "Data as a product: each dataset is treated as a product with discoverability, quality, SLOs, and documentation",
      "Self-serve data platform: a shared infrastructure layer that enables domain teams to build and publish data products without deep infrastructure expertise",
      "Federated computational governance: global policies enforced through automation and platform capabilities rather than centralized gatekeepers"
    ],
    "core_concepts_zh": [
      "领域所有权：数据由产生它的领域拥有和提供，而非中心化数据团队",
      "数据即产品：每个数据集被视为产品，具备可发现性、质量保障、SLO和文档",
      "自助式数据平台：共享基础设施层，使领域团队无需深厚基础设施专业知识即可构建和发布数据产品",
      "联邦式计算治理：通过自动化和平台能力而非集中式守门人来执行全局策略"
    ],
    "timeline": [
      [
        "2019",
        "Zhamak Dehghani publishes 'How to Move Beyond a Monolithic Data Lake to a Distributed Data Mesh' at ThoughtWorks"
      ],
      [
        "2020",
        "Data Mesh gains rapid industry adoption; ThoughtWorks Technology Radar features it as a key trend"
      ],
      [
        "2022",
        "Dehghani publishes the definitive O'Reilly book 'Data Mesh' formalizing the four principles"
      ],
      [
        "2023",
        "Major cloud vendors (AWS, GCP, Azure) release data mesh reference architectures and tooling"
      ]
    ],
    "timeline_zh": [
      [
        "2019",
        "Zhamak Dehghani 在 ThoughtWorks 发表「如何从单体数据湖走向分布式数据网格」"
      ],
      [
        "2020",
        "数据网格获得行业快速采纳；ThoughtWorks 技术雷达将其列为关键趋势"
      ],
      [
        "2022",
        "Dehghani 出版 O'Reilly 权威著作《数据网格》，正式化四大原则"
      ],
      [
        "2023",
        "主要云厂商（AWS、GCP、Azure）发布数据网格参考架构和工具"
      ]
    ],
    "dos": [
      "Do align data product boundaries with business domain boundaries because misalignment creates confusion about ownership",
      "Do invest in a self-serve platform before asking domains to own their data because without it domain teams will drown in infrastructure complexity",
      "Do define global interoperability standards (schemas, SLOs, metadata) because mesh-wide discoverability depends on consistency",
      "Do start with a single domain pilot before scaling because mesh adoption requires organizational learning"
    ],
    "dos_zh": [
      "将数据产品边界与业务领域边界对齐，因为不对齐会导致所有权混乱",
      "在要求领域拥有数据之前先投资自助式平台，否则领域团队会被基础设施复杂性淹没",
      "定义全局互操作标准（模式、SLO、元数据），因为网格范围的可发现性依赖一致性",
      "在扩展前先从单个领域试点开始，因为网格采纳需要组织学习"
    ],
    "donts": [
      "Don't simply rename existing data lake tables as 'data products' because data mesh requires genuine domain ownership and product thinking",
      "Don't skip governance because decentralization without standards leads to data chaos and duplication",
      "Don't force every team to adopt mesh simultaneously because a phased rollout reduces organizational resistance",
      "Don't neglect cross-domain data quality contracts because consumers need trust guarantees across boundaries"
    ],
    "donts_zh": [
      "不要简单地将现有数据湖表重命名为「数据产品」，因为数据网格需要真正的领域所有权和产品思维",
      "不要跳过治理，因为没有标准的去中心化会导致数据混乱和重复",
      "不要强迫每个团队同时采用网格，因为分阶段推出可减少组织阻力",
      "不要忽视跨领域数据质量契约，因为消费者需要跨边界的信任保障"
    ],
    "case_study_company": "Zalando",
    "case_study": "Zalando adopted Data Mesh to decentralize its analytics infrastructure across 200+ autonomous product teams. Each team publishes data products through a self-serve platform built on AWS and Apache Kafka. This eliminated the central data team bottleneck and reduced the average time to deliver a new analytical dataset from weeks to days.",
    "case_study_zh": "Zalando 采用数据网格将分析基础设施去中心化分配给200多个自治产品团队。每个团队通过基于AWS和Apache Kafka构建的自助式平台发布数据产品。这消除了中心化数据团队的瓶颈，将交付新分析数据集的平均时间从数周缩短到数天。",
    "when_not_to_use": [
      "Small organizations with a single domain where centralized data management is simpler and sufficient",
      "Early-stage startups that lack the organizational maturity to sustain distributed data ownership",
      "When data volumes and team count are small enough that a central data team is not a bottleneck",
      "Highly regulated environments where centralized control is legally mandated"
    ],
    "when_not_to_use_zh": [
      "只有单一领域的小型组织，集中式数据管理更简单且足够",
      "缺乏组织成熟度来维持分布式数据所有权的早期创业公司",
      "当数据量和团队数量足够小、中心化数据团队未成为瓶颈时",
      "法律要求集中控制的高度监管环境"
    ],
    "adopters": [
      "Zalando",
      "JPMorgan Chase",
      "Netflix",
      "Intuit",
      "ThoughtWorks"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "scalability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Dehghani, Z. (2022). \"Data Mesh: Delivering Data-Driven Value at Scale\". O'Reilly Media.",
    "secondary_sources": [
      "Dehghani, Z. (2019). \"How to Move Beyond a Monolithic Data Lake to a Distributed Data Mesh\". ThoughtWorks.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\". O'Reilly Media.",
      "Machado, I., Costa, C. & Santos, M.Y. (2022). \"Data Mesh: Concepts and Principles of a Paradigm Shift in Data Architectures\". Procedia Computer Science, Vol. 196."
    ],
    "typed_relations": [
      {
        "slug": "domain-driven-design",
        "type": "complement"
      },
      {
        "slug": "microservices-decomposition",
        "type": "complement"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      },
      {
        "slug": "polyglot-persistence",
        "type": "complement"
      }
    ]
  },
  {
    "id": 102,
    "name": "Lambda Architecture",
    "name_zh": "Lambda 架构",
    "slug": "lambda-architecture",
    "category": "data",
    "desc": "Batch plus speed layers for scalable big data processing",
    "desc_zh": "结合批处理层与速度层的可扩展大数据处理架构",
    "steps": [
      "Ingest all raw data immutably into a master dataset that serves as the system's append-only source of truth",
      "Build a batch layer that periodically recomputes comprehensive batch views from the entire master dataset using frameworks like Hadoop or Spark",
      "Build a speed layer that processes only recent data in real-time using stream processors like Storm or Flink to compensate for batch layer latency",
      "Merge batch views and real-time views at query time in a serving layer that exposes a unified interface to consumers",
      "Continuously validate that speed layer results converge with batch layer results and that eventual consistency is maintained"
    ],
    "steps_zh": [
      "将所有原始数据不可变地摄入主数据集，作为系统的追加式真实数据源",
      "构建批处理层，使用Hadoop或Spark等框架定期从整个主数据集重新计算全量批视图",
      "构建速度层，使用Storm或Flink等流处理器仅实时处理最近数据，以弥补批处理层的延迟",
      "在服务层查询时合并批视图和实时视图，向消费者暴露统一接口",
      "持续验证速度层结果与批处理层结果的收敛性，确保最终一致性"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Ingest",
      "Batch Layer",
      "Speed Layer",
      "Serving Layer",
      "Query"
    ],
    "viz_labels_zh": [
      "摄入",
      "批处理层",
      "速度层",
      "服务层",
      "查询"
    ],
    "related": [
      "kappa-architecture",
      "stream-processing-patterns",
      "cap-theorem"
    ],
    "tags": [
      "big-data",
      "batch-processing",
      "stream-processing",
      "lambda"
    ],
    "origin_author": "Nathan Marz, 2011",
    "origin_source": "Big Data: Principles and Best Practices of Scalable Real-Time Data Systems (Manning, 2015)",
    "origin_source_zh": "《大数据：可扩展实时数据系统的原理与最佳实践》（Manning，2015）",
    "complexity": "advanced",
    "when_to_use": [
      "When you need both comprehensive historical analytics and low-latency real-time results",
      "When data correctness requires periodic full recomputation to fix upstream errors",
      "When existing batch infrastructure must coexist with new real-time requirements",
      "When fault tolerance demands that real-time views can always be reconstructed from immutable raw data"
    ],
    "when_to_use_zh": [
      "当需要同时具备全面历史分析和低延迟实时结果时",
      "当数据正确性要求定期全量重算以修复上游错误时",
      "当现有批处理基础设施必须与新的实时需求共存时",
      "当容错要求实时视图始终可从不可变原始数据重建时"
    ],
    "core_concepts": [
      "Immutable master dataset: all data is stored in raw, append-only form as the single source of truth",
      "Batch layer: periodically reprocesses the entire dataset to produce accurate, comprehensive views",
      "Speed layer: processes only recent data in real-time to provide low-latency approximate views",
      "Serving layer: merges batch and speed layer outputs to answer queries with both completeness and freshness"
    ],
    "core_concepts_zh": [
      "不可变主数据集：所有数据以原始追加形式存储，作为唯一真实数据源",
      "批处理层：定期重新处理整个数据集以产生准确、全面的视图",
      "速度层：仅实时处理最近数据以提供低延迟的近似视图",
      "服务层：合并批处理层和速度层的输出，以兼顾完整性和时效性来响应查询"
    ],
    "timeline": [
      [
        "2011",
        "Nathan Marz introduces the Lambda Architecture concept based on his work at Twitter with Apache Storm"
      ],
      [
        "2013",
        "Lambda Architecture gains wide adoption as Hadoop ecosystem matures and real-time needs grow"
      ],
      [
        "2014",
        "Jay Kreps publishes 'Questioning the Lambda Architecture' proposing Kappa as a simpler alternative"
      ],
      [
        "2015",
        "Marz and Warren publish 'Big Data' (Manning), the definitive reference for Lambda Architecture"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Nathan Marz 基于在Twitter使用Apache Storm的经验提出Lambda架构概念"
      ],
      [
        "2013",
        "随着Hadoop生态系统成熟和实时需求增长，Lambda架构获得广泛采纳"
      ],
      [
        "2014",
        "Jay Kreps 发表「质疑Lambda架构」，提出Kappa作为更简单的替代方案"
      ],
      [
        "2015",
        "Marz和Warren出版《大数据》（Manning），成为Lambda架构的权威参考"
      ]
    ],
    "dos": [
      "Do keep the master dataset immutable and append-only because it enables full recomputation and error correction",
      "Do design batch and speed layers to produce eventually consistent results because divergence erodes consumer trust",
      "Do automate batch view recomputation on a regular schedule because stale batch views negate the architecture's benefits",
      "Do monitor latency gaps between speed and batch layers because large gaps indicate system health issues"
    ],
    "dos_zh": [
      "保持主数据集不可变且仅追加，因为这使全量重算和错误修正成为可能",
      "设计批处理和速度层产生最终一致的结果，因为分歧会侵蚀消费者信任",
      "定期自动化批视图重算，因为过时的批视图会抵消架构的优势",
      "监控速度层和批处理层之间的延迟差距，因为差距过大说明系统健康状况存在问题"
    ],
    "donts": [
      "Don't maintain two completely different codebases for batch and speed logic because it doubles maintenance burden and introduces semantic drift",
      "Don't skip the serving layer merge logic because consumers should not need to understand the dual-layer internals",
      "Don't use Lambda when a single stream processing layer can meet your latency and correctness requirements",
      "Don't ignore the operational complexity of running two parallel processing systems"
    ],
    "donts_zh": [
      "不要为批处理和速度逻辑维护两套完全不同的代码库，因为这会使维护负担翻倍并引入语义漂移",
      "不要跳过服务层合并逻辑，因为消费者不应需要理解双层内部细节",
      "当单一流处理层能满足延迟和正确性要求时不要使用Lambda",
      "不要忽视运行两个并行处理系统的运维复杂性"
    ],
    "case_study_company": "Twitter",
    "case_study": "Twitter used Lambda Architecture to power its real-time analytics and trending topics. The batch layer processed the full tweet history using Hadoop for accurate aggregate metrics, while the speed layer used Apache Storm to detect trending topics within seconds. This dual approach allowed Twitter to serve both precise historical dashboards and real-time trend detection at scale.",
    "case_study_zh": "Twitter使用Lambda架构驱动其实时分析和热门话题功能。批处理层使用Hadoop处理完整推文历史以获取准确的聚合指标，而速度层使用Apache Storm在几秒内检测热门话题。这种双层方法使Twitter能够大规模同时提供精确的历史仪表板和实时趋势检测。",
    "when_not_to_use": [
      "When the operational overhead of maintaining two processing systems is not justified by your scale",
      "When stream processing alone can provide both correctness and low latency for your use case",
      "When your team lacks the expertise to maintain synchronized batch and speed layer logic",
      "When data volumes are small enough that a simple database with incremental updates suffices"
    ],
    "when_not_to_use_zh": [
      "当业务规模不足以支撑维护两套处理系统的运维开销时",
      "当单独的流处理即可为用例提供正确性和低延迟时",
      "当团队缺乏维护同步批处理和速度层逻辑的专业知识时",
      "当数据量足够小，简单数据库加增量更新即可满足时"
    ],
    "adopters": [
      "Twitter",
      "LinkedIn",
      "Yahoo",
      "Netflix",
      "Airbnb"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "performance",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Marz, N. & Warren, J. (2015). \"Big Data: Principles and Best Practices of Scalable Real-Time Data Systems\". Manning Publications.",
    "secondary_sources": [
      "Marz, N. (2011). \"How to Beat the CAP Theorem\". nathanmarz.com.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 11. O'Reilly Media.",
      "Kreps, J. (2014). \"Questioning the Lambda Architecture\". O'Reilly Radar."
    ],
    "typed_relations": [
      {
        "slug": "kappa-architecture",
        "type": "alternative"
      },
      {
        "slug": "stream-processing-patterns",
        "type": "complement"
      },
      {
        "slug": "cap-theorem",
        "type": "complement"
      }
    ]
  },
  {
    "id": 103,
    "name": "Kappa Architecture",
    "name_zh": "Kappa 架构",
    "slug": "kappa-architecture",
    "category": "data",
    "desc": "Stream-first architecture eliminating the batch layer entirely",
    "desc_zh": "以流为核心、完全消除批处理层的数据架构",
    "steps": [
      "Ingest all data as immutable events into a durable, replayable log such as Apache Kafka with configurable retention",
      "Process all data through a single stream processing layer using frameworks like Apache Flink, Kafka Streams, or Apache Beam",
      "Materialize stream processing results into serving stores (databases, caches, search indices) optimized for query patterns",
      "When logic changes or reprocessing is needed, deploy a new version of the stream processor and replay the log from the desired offset",
      "Retire the old materialized views once the new processor has caught up, achieving zero-downtime schema evolution"
    ],
    "steps_zh": [
      "将所有数据作为不可变事件摄入持久化、可重放的日志（如Apache Kafka），配置合适的保留策略",
      "通过单一流处理层使用Apache Flink、Kafka Streams或Apache Beam等框架处理所有数据",
      "将流处理结果物化到针对查询模式优化的服务存储（数据库、缓存、搜索索引）",
      "当逻辑变更或需要重处理时，部署新版本流处理器并从期望偏移量重放日志",
      "新处理器追上进度后淘汰旧的物化视图，实现零停机模式演进"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Event Log",
      "Stream Process",
      "Serving Store",
      "Replay",
      "Reprocess"
    ],
    "viz_labels_zh": [
      "事件日志",
      "流处理",
      "服务存储",
      "重放",
      "再处理"
    ],
    "related": [
      "lambda-architecture",
      "eda",
      "change-data-capture",
      "stream-processing-patterns"
    ],
    "tags": [
      "streaming",
      "kappa",
      "event-log",
      "real-time",
      "simplicity"
    ],
    "origin_author": "Jay Kreps, 2014",
    "origin_source": "Questioning the Lambda Architecture (O'Reilly blog post, 2014)",
    "origin_source_zh": "「质疑Lambda架构」（O'Reilly博客文章，2014）",
    "complexity": "advanced",
    "when_to_use": [
      "When your use case is primarily real-time and batch recomputation adds unnecessary complexity",
      "When you want a single processing codebase to reduce maintenance and eliminate semantic drift between layers",
      "When your event log can be retained long enough to enable full reprocessing when logic changes",
      "When your stream processing framework supports exactly-once semantics and stateful operations"
    ],
    "when_to_use_zh": [
      "当用例主要是实时的，批量重算增加了不必要的复杂性时",
      "当希望使用单一处理代码库以减少维护并消除层间语义漂移时",
      "当事件日志能够保留足够长时间以支持逻辑变更时的全量重处理时",
      "当流处理框架支持精确一次语义和有状态操作时"
    ],
    "core_concepts": [
      "Immutable event log: all data enters as an append-only log that can be replayed from any point in time",
      "Single processing layer: one stream processing engine handles both real-time and historical reprocessing",
      "Log replay for reprocessing: when business logic changes, replay the event log through an updated processor instead of maintaining a separate batch system",
      "Materialized views: stream results are written into purpose-built serving stores optimized for specific query patterns"
    ],
    "core_concepts_zh": [
      "不可变事件日志：所有数据以追加日志形式进入，可从任意时间点重放",
      "单一处理层：一个流处理引擎同时处理实时和历史重处理",
      "日志重放实现重处理：当业务逻辑变更时，通过更新后的处理器重放事件日志，而非维护独立的批处理系统",
      "物化视图：流处理结果写入为特定查询模式优化的专用服务存储"
    ],
    "timeline": [
      [
        "2011",
        "Apache Kafka created at LinkedIn establishes the durable replayable log as a core infrastructure primitive"
      ],
      [
        "2014",
        "Jay Kreps publishes 'Questioning the Lambda Architecture' proposing Kappa as a simpler alternative"
      ],
      [
        "2017",
        "Apache Flink matures with exactly-once semantics, making Kappa viable for mission-critical workloads"
      ],
      [
        "2020",
        "Kappa Architecture becomes the default pattern for cloud-native streaming platforms like Confluent and Amazon Kinesis"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "LinkedIn创建的Apache Kafka将持久可重放日志确立为核心基础设施原语"
      ],
      [
        "2014",
        "Jay Kreps发表「质疑Lambda架构」，提出Kappa作为更简单的替代方案"
      ],
      [
        "2017",
        "Apache Flink成熟并支持精确一次语义，使Kappa适用于关键任务工作负载"
      ],
      [
        "2020",
        "Kappa架构成为Confluent和Amazon Kinesis等云原生流平台的默认模式"
      ]
    ],
    "dos": [
      "Do size your event log retention to cover the longest reprocessing window you anticipate because insufficient retention breaks the replay mechanism",
      "Do use a framework with exactly-once processing guarantees because at-least-once can produce incorrect materialized views",
      "Do version your stream processing jobs so old and new versions can run in parallel during migrations",
      "Do monitor consumer lag closely because growing lag indicates the stream processor cannot keep up with input volume"
    ],
    "dos_zh": [
      "将事件日志保留期设置为能覆盖预期最长重处理窗口，因为保留不足会破坏重放机制",
      "使用具有精确一次处理保证的框架，因为至少一次可能产生不正确的物化视图",
      "对流处理作业进行版本管理，使新旧版本可在迁移期间并行运行",
      "密切监控消费者滞后，因为滞后增长表明流处理器无法跟上输入量"
    ],
    "donts": [
      "Don't assume infinite log retention is free because storage costs and compaction strategies must be planned",
      "Don't use Kappa when you genuinely need periodic full-dataset recomputation that exceeds your log retention window",
      "Don't neglect state management in stream processors because stateful operations require checkpointing and recovery",
      "Don't ignore backpressure signals because unhandled backpressure leads to cascading failures"
    ],
    "donts_zh": [
      "不要假设无限日志保留是免费的，因为存储成本和压缩策略必须规划",
      "当确实需要超出日志保留窗口的周期性全数据集重算时不要使用Kappa",
      "不要忽视流处理器中的状态管理，因为有状态操作需要检查点和恢复",
      "不要忽视背压信号，因为未处理的背压会导致级联故障"
    ],
    "case_study_company": "LinkedIn",
    "case_study": "LinkedIn pioneered the Kappa Architecture pattern through its creation of Apache Kafka and its internal stream processing infrastructure. By replacing batch ETL pipelines with Kafka-based streaming, LinkedIn unified its activity tracking, metrics, and feed generation into a single real-time processing paradigm. This reduced infrastructure complexity by eliminating the dual batch-speed codebases and enabled sub-second data freshness for features like 'Who Viewed Your Profile'.",
    "case_study_zh": "LinkedIn通过创建Apache Kafka及其内部流处理基础设施开创了Kappa架构模式。通过用基于Kafka的流处理替代批量ETL管道，LinkedIn将活动追踪、指标和信息流生成统一为单一实时处理范式。这通过消除双重批处理-速度代码库降低了基础设施复杂性，并为「谁查看了你的档案」等功能实现了亚秒级数据时效性。",
    "when_not_to_use": [
      "When log retention costs are prohibitive for the data volumes and retention periods required",
      "When complex analytical queries require full-dataset joins that are impractical in a streaming context",
      "When existing batch infrastructure is well-established and the team lacks streaming expertise",
      "When regulatory requirements mandate periodic full recomputation from source systems rather than log replay"
    ],
    "when_not_to_use_zh": [
      "当所需数据量和保留期的日志保留成本过高时",
      "当复杂分析查询需要在流上下文中不切实际的全数据集连接时",
      "当现有批处理基础设施成熟且团队缺乏流处理专业知识时",
      "当监管要求从源系统定期全量重算而非日志重放时"
    ],
    "adopters": [
      "LinkedIn",
      "Confluent",
      "Uber",
      "The New York Times",
      "Wix"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "maintainability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Kreps, J. (2014). \"Questioning the Lambda Architecture\". O'Reilly Radar.",
    "secondary_sources": [
      "Kreps, J. (2013). \"The Log: What Every Software Engineer Should Know About Real-Time Data's Unifying Abstraction\". LinkedIn Engineering.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 11. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "lambda-architecture",
        "type": "alternative"
      },
      {
        "slug": "eda",
        "type": "complement"
      },
      {
        "slug": "change-data-capture",
        "type": "complement"
      },
      {
        "slug": "stream-processing-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 104,
    "name": "Stream Processing Patterns",
    "name_zh": "流处理模式",
    "slug": "stream-processing-patterns",
    "category": "data",
    "desc": "Patterns for continuous data stream windowing, joining, and semantics",
    "desc_zh": "连续数据流窗口化、连接和语义保证的处理模式",
    "steps": [
      "Define windowing strategies (tumbling, sliding, session, or global) based on the temporal semantics your analytics require",
      "Implement stream joins by choosing the correct join type (stream-stream, stream-table, or table-table) and configuring appropriate time bounds and key partitioning",
      "Configure exactly-once processing semantics through idempotent writes, transactional producers, or framework-level guarantees like Flink checkpoints",
      "Handle late-arriving and out-of-order events using watermarks, allowed lateness thresholds, and side outputs for dropped events",
      "Operationalize the pipeline with backpressure handling, state management, checkpoint tuning, and consumer lag monitoring"
    ],
    "steps_zh": [
      "根据分析所需的时间语义定义窗口策略（滚动、滑动、会话或全局窗口）",
      "通过选择正确的连接类型（流-流、流-表或表-表）并配置适当的时间边界和键分区来实现流连接",
      "通过幂等写入、事务生产者或框架级保证（如Flink检查点）配置精确一次处理语义",
      "使用水印、允许延迟阈值和旁路输出处理迟到和乱序事件",
      "通过背压处理、状态管理、检查点调优和消费者滞后监控使管道运维化"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Window",
      "Join",
      "Exactly-Once",
      "Watermark",
      "Backpressure"
    ],
    "viz_labels_zh": [
      "时间窗口",
      "流连接",
      "精确一次",
      "水位线",
      "背压"
    ],
    "related": [
      "kappa-architecture",
      "lambda-architecture",
      "eda",
      "change-data-capture"
    ],
    "tags": [
      "streaming",
      "windowing",
      "exactly-once",
      "joins",
      "watermarks"
    ],
    "origin_author": "Jay Kreps / Martin Kleppmann, 2014-2017",
    "origin_source": "Designing Data-Intensive Applications, Chapter 11: Stream Processing (O'Reilly, 2017)",
    "origin_source_zh": "《设计数据密集型应用》第11章：流处理（O'Reilly，2017）",
    "complexity": "advanced",
    "when_to_use": [
      "When building real-time analytics, monitoring, or alerting systems on continuous data feeds",
      "When event-time processing is critical and events arrive out of order or with variable delay",
      "When joining multiple data streams or enriching streams with reference data in real-time",
      "When exactly-once processing semantics are required for financial, billing, or compliance workloads"
    ],
    "when_to_use_zh": [
      "当在连续数据流上构建实时分析、监控或告警系统时",
      "当事件时间处理至关重要且事件乱序到达或延迟不定时",
      "当需要实时连接多个数据流或用参考数据丰富流时",
      "当金融、计费或合规工作负载要求精确一次处理语义时"
    ],
    "core_concepts": [
      "Windowing: grouping unbounded streams into finite chunks (tumbling, sliding, session windows) for aggregation and analysis",
      "Event time vs processing time: distinguishing when an event actually occurred from when it was processed, using watermarks to track progress",
      "Exactly-once semantics: ensuring each event is processed precisely once despite failures, through checkpointing, idempotent sinks, and transactional writes",
      "Stream joins: combining two or more streams based on keys and time constraints, including stream-stream windowed joins and stream-table enrichment joins"
    ],
    "core_concepts_zh": [
      "窗口化：将无界流分组为有限块（滚动、滑动、会话窗口）以进行聚合和分析",
      "事件时间与处理时间：使用水印追踪进度，区分事件实际发生时间和处理时间",
      "精确一次语义：通过检查点、幂等接收器和事务写入确保每个事件在故障情况下仅被处理一次",
      "流连接：基于键和时间约束组合两个或多个流，包括流-流窗口连接和流-表丰富连接"
    ],
    "timeline": [
      [
        "2011",
        "Apache Storm pioneers distributed real-time stream processing with at-least-once guarantees"
      ],
      [
        "2014",
        "Apache Flink introduces event-time processing, watermarks, and exactly-once state consistency"
      ],
      [
        "2017",
        "Kleppmann's 'Designing Data-Intensive Applications' codifies stream processing patterns as foundational knowledge"
      ],
      [
        "2019",
        "Kafka Streams and Apache Beam mature as portable stream processing APIs abstracting execution engines"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Apache Storm以至少一次保证开创分布式实时流处理"
      ],
      [
        "2014",
        "Apache Flink引入事件时间处理、水印和精确一次状态一致性"
      ],
      [
        "2017",
        "Kleppmann的《设计数据密集型应用》将流处理模式编纂为基础知识"
      ],
      [
        "2019",
        "Kafka Streams和Apache Beam作为抽象执行引擎的可移植流处理API走向成熟"
      ]
    ],
    "dos": [
      "Do choose event-time semantics over processing-time when correctness matters because processing-time windows produce non-deterministic results",
      "Do set watermarks and allowed lateness thresholds based on empirical analysis of your data's delay characteristics",
      "Do design for idempotent sinks because even with exactly-once frameworks, sink-level idempotency provides defense in depth",
      "Do test stream processing logic with both in-order and out-of-order event sequences because edge cases dominate production failures"
    ],
    "dos_zh": [
      "当正确性重要时选择事件时间语义而非处理时间，因为处理时间窗口产生非确定性结果",
      "基于数据延迟特征的经验分析设置水印和允许延迟阈值",
      "设计幂等接收器，因为即使使用精确一次框架，接收端幂等性也提供纵深防御",
      "使用有序和乱序事件序列测试流处理逻辑，因为边缘情况主导生产故障"
    ],
    "donts": [
      "Don't use processing-time windows for business metrics because clock skew and variable latency produce incorrect aggregates",
      "Don't ignore state size growth in windowed aggregations because unbounded state leads to out-of-memory failures",
      "Don't assume exactly-once means zero duplicates end-to-end because the guarantee applies only within the processing framework boundaries",
      "Don't skip backpressure configuration because uncontrolled producers will overwhelm slow consumers"
    ],
    "donts_zh": [
      "不要对业务指标使用处理时间窗口，因为时钟偏移和可变延迟会产生不正确的聚合",
      "不要忽视窗口聚合中的状态大小增长，因为无界状态会导致内存溢出故障",
      "不要假设精确一次意味着端到端零重复，因为保证仅适用于处理框架边界内",
      "不要跳过背压配置，因为不受控的生产者会压垮慢速消费者"
    ],
    "case_study_company": "Uber",
    "case_study": "Uber built its real-time surge pricing and ETA computation on Apache Flink stream processing patterns. Using event-time windows and stream-table joins, Uber processes millions of trip events per second to compute supply-demand ratios per geofence. Exactly-once semantics ensure pricing accuracy, while watermark-based late event handling prevents stale data from corrupting surge multipliers.",
    "case_study_zh": "Uber基于Apache Flink流处理模式构建了实时动态定价和预计到达时间计算。使用事件时间窗口和流-表连接，Uber每秒处理数百万行程事件以计算每个地理围栏的供需比。精确一次语义确保定价准确，基于水印的迟到事件处理防止过时数据破坏动态定价倍数。",
    "when_not_to_use": [
      "When batch processing with hourly or daily granularity meets your latency requirements",
      "When data volumes are low enough that a simple poll-and-process loop is sufficient",
      "When your team lacks expertise in distributed stream processing and the learning curve is too steep",
      "When the data source does not provide event timestamps and event-time semantics cannot be established"
    ],
    "when_not_to_use_zh": [
      "当小时级或天级粒度的批处理满足延迟要求时",
      "当数据量足够低，简单的轮询处理循环即可满足时",
      "当团队缺乏分布式流处理专业知识且学习曲线过陡时",
      "当数据源不提供事件时间戳且无法建立事件时间语义时"
    ],
    "adopters": [
      "Uber",
      "Alibaba",
      "Netflix",
      "Spotify",
      "ING Bank"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "performance",
      "scalability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 11: Stream Processing. O'Reilly Media.",
    "secondary_sources": [
      "Kreps, J. (2014). \"I Heart Logs: Event Data, Stream Processing, and Data Integration\". O'Reilly Media.",
      "Akidau, T. et al. (2015). \"The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing\". Proceedings of the VLDB Endowment, 8(12).",
      "Hueske, F. & Kalavri, V. (2019). \"Stream Processing with Apache Flink\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "kappa-architecture",
        "type": "complement"
      },
      {
        "slug": "lambda-architecture",
        "type": "complement"
      },
      {
        "slug": "eda",
        "type": "complement"
      },
      {
        "slug": "change-data-capture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 105,
    "name": "Change Data Capture (CDC)",
    "name_zh": "变更数据捕获",
    "slug": "change-data-capture",
    "category": "data",
    "desc": "Track database changes as real-time event streams for downstream",
    "desc_zh": "将数据库变更作为实时事件流捕获并传递给下游系统",
    "steps": [
      "Configure a CDC connector (e.g., Debezium) to read the database's transaction log (WAL, binlog, or oplog) without impacting production query performance",
      "Publish change events to a durable message broker like Apache Kafka, preserving the original transaction order and including before/after snapshots of each row",
      "Transform and enrich change events in a stream processing layer to match downstream consumer schemas and business rules",
      "Route transformed events to target systems: data warehouses, search indices, caches, or other microservices that need synchronized data",
      "Monitor CDC pipeline health including replication lag, schema drift detection, and connector failure alerting to ensure data consistency"
    ],
    "steps_zh": [
      "配置CDC连接器（如Debezium）读取数据库事务日志（WAL、binlog或oplog），不影响生产查询性能",
      "将变更事件发布到持久消息代理（如Apache Kafka），保留原始事务顺序并包含每行的变更前后快照",
      "在流处理层转换和丰富变更事件，以匹配下游消费者的模式和业务规则",
      "将转换后的事件路由到目标系统：数据仓库、搜索索引、缓存或需要同步数据的其他微服务",
      "监控CDC管道健康状况，包括复制延迟、模式漂移检测和连接器故障告警，确保数据一致性"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "WAL / Binlog",
      "Change Event",
      "Stream Transform",
      "Target Sync",
      "Monitor"
    ],
    "viz_labels_zh": [
      "事务日志",
      "变更事件",
      "流转换",
      "目标同步",
      "监控"
    ],
    "related": [
      "kappa-architecture",
      "stream-processing-patterns",
      "saga-pattern",
      "eda"
    ],
    "tags": [
      "cdc",
      "replication",
      "event-streams",
      "debezium",
      "database"
    ],
    "origin_author": "Database community / Popularized by Debezium (Red Hat), 2015",
    "origin_source": "Designing Data-Intensive Applications, Chapter 11: Stream Processing (O'Reilly, 2017)",
    "origin_source_zh": "《设计数据密集型应用》第11章：流处理（O'Reilly，2017）",
    "complexity": "intermediate",
    "when_to_use": [
      "When microservices need to keep local read models synchronized with a source database without tight coupling",
      "When replacing batch ETL with real-time data replication to a data warehouse or data lake",
      "When building event-driven architectures that react to database state changes as they happen",
      "When search indices, caches, or materialized views must stay in near-real-time sync with the primary datastore"
    ],
    "when_to_use_zh": [
      "当微服务需要在不紧耦合的情况下保持本地读模型与源数据库同步时",
      "当用实时数据复制替代批量ETL到数据仓库或数据湖时",
      "当构建对数据库状态变更实时响应的事件驱动架构时",
      "当搜索索引、缓存或物化视图必须与主数据存储保持近实时同步时"
    ],
    "core_concepts": [
      "Log-based capture: reading the database's write-ahead log or binary log to detect changes without modifying the application or adding triggers",
      "Event ordering: preserving the exact transaction order so downstream systems can replay changes and maintain consistency",
      "Schema evolution: handling source schema changes gracefully so downstream consumers are not broken by column additions or type changes",
      "Snapshot plus streaming: performing an initial full snapshot of existing data, then switching to streaming incremental changes going forward"
    ],
    "core_concepts_zh": [
      "基于日志的捕获：读取数据库的预写日志或二进制日志来检测变更，无需修改应用或添加触发器",
      "事件排序：保留精确的事务顺序，使下游系统能够重放变更并保持一致性",
      "模式演进：优雅处理源模式变更，使下游消费者不会因列添加或类型变更而中断",
      "快照加流式：先对现有数据执行全量快照，然后切换到流式增量变更"
    ],
    "timeline": [
      [
        "2012",
        "LinkedIn develops Databus, one of the first open-source CDC systems for Oracle and MySQL"
      ],
      [
        "2015",
        "Debezium project launches at Red Hat, providing log-based CDC connectors for Kafka Connect"
      ],
      [
        "2017",
        "Kleppmann's 'Designing Data-Intensive Applications' establishes CDC as a core data architecture pattern"
      ],
      [
        "2021",
        "CDC becomes a standard feature in cloud databases (AWS DMS, Azure CDC, GCP Datastream)"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "LinkedIn开发Databus，首批支持Oracle和MySQL的开源CDC系统之一"
      ],
      [
        "2015",
        "Debezium项目在Red Hat启动，为Kafka Connect提供基于日志的CDC连接器"
      ],
      [
        "2017",
        "Kleppmann的《设计数据密集型应用》将CDC确立为核心数据架构模式"
      ],
      [
        "2021",
        "CDC成为云数据库标准特性（AWS DMS、Azure CDC、GCP Datastream）"
      ]
    ],
    "dos": [
      "Do use log-based CDC over trigger-based or polling approaches because log reading has minimal impact on source database performance",
      "Do handle schema evolution by using a schema registry and backward-compatible serialization formats like Avro",
      "Do monitor replication lag between the source database and downstream consumers because growing lag indicates pipeline issues",
      "Do plan for initial snapshot strategy because CDC connectors need to bootstrap existing data before streaming incremental changes"
    ],
    "dos_zh": [
      "使用基于日志的CDC而非触发器或轮询方式，因为日志读取对源数据库性能影响最小",
      "通过使用模式注册表和Avro等向后兼容的序列化格式处理模式演进",
      "监控源数据库和下游消费者之间的复制延迟，因为延迟增长表明管道问题",
      "规划初始快照策略，因为CDC连接器在流式增量变更之前需要引导现有数据"
    ],
    "donts": [
      "Don't use trigger-based CDC in high-throughput OLTP systems because triggers add write latency to every transaction",
      "Don't ignore schema changes in the source database because unhandled schema drift will break downstream consumers",
      "Don't assume CDC guarantees exactly-once delivery because at-least-once is the default for most connectors and idempotency must be handled downstream",
      "Don't expose raw CDC events directly to business consumers because internal database schemas leak implementation details"
    ],
    "donts_zh": [
      "不要在高吞吐OLTP系统中使用基于触发器的CDC，因为触发器会为每个事务增加写延迟",
      "不要忽略源数据库中的模式变更，因为未处理的模式漂移会破坏下游消费者",
      "不要假设CDC保证精确一次传递，因为大多数连接器默认为至少一次，幂等性必须在下游处理",
      "不要将原始CDC事件直接暴露给业务消费者，因为内部数据库模式会泄露实现细节"
    ],
    "case_study_company": "Shopify",
    "case_study": "Shopify uses Debezium-based CDC to stream changes from its MySQL databases into Apache Kafka, enabling real-time synchronization of merchant data across its microservices architecture. This replaced brittle batch ETL jobs and reduced data propagation latency from hours to seconds, enabling features like real-time inventory updates and instant order status notifications for millions of merchants.",
    "case_study_zh": "Shopify使用基于Debezium的CDC将MySQL数据库的变更流式传输到Apache Kafka，实现微服务架构中商户数据的实时同步。这替代了脆弱的批量ETL作业，将数据传播延迟从小时级降低到秒级，为数百万商户启用了实时库存更新和即时订单状态通知等功能。",
    "when_not_to_use": [
      "When the source database does not expose a transaction log or the log format is proprietary and unsupported",
      "When data freshness requirements are measured in hours or days and batch ETL is simpler and sufficient",
      "When the target system can directly query the source database without needing a separate copy",
      "When compliance rules prohibit reading database transaction logs due to sensitive data exposure concerns"
    ],
    "when_not_to_use_zh": [
      "当源数据库不暴露事务日志或日志格式为专有且不受支持时",
      "当数据时效性要求以小时或天计算且批量ETL更简单足够时",
      "当目标系统可以直接查询源数据库而无需单独副本时",
      "当合规规则因敏感数据暴露问题而禁止读取数据库事务日志时"
    ],
    "adopters": [
      "Shopify",
      "Airbnb",
      "Zalando",
      "WePay",
      "Convoy"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 11: Stream Processing. O'Reilly Media.",
    "secondary_sources": [
      "Kozlovski, R. (2020). \"Change Data Capture with Debezium\". Red Hat Developer Blog.",
      "Narkhede, N., Shapira, G. & Palino, T. (2017). \"Kafka: The Definitive Guide\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "kappa-architecture",
        "type": "complement"
      },
      {
        "slug": "stream-processing-patterns",
        "type": "complement"
      },
      {
        "slug": "saga-pattern",
        "type": "complement"
      },
      {
        "slug": "eda",
        "type": "complement"
      }
    ]
  },
  {
    "id": 106,
    "name": "Data Lakehouse",
    "name_zh": "数据湖仓一体",
    "slug": "data-lakehouse",
    "category": "data",
    "desc": "Unified architecture combining data lake flexibility with warehouse reliability",
    "desc_zh": "结合数据湖灵活性与数据仓库可靠性的统一架构",
    "steps": [
      "Store all raw data in open file formats (Parquet, ORC) on cloud object storage (S3, ADLS, GCS) to maintain data lake flexibility and cost efficiency",
      "Add a transactional metadata layer using Delta Lake, Apache Iceberg, or Apache Hudi to provide ACID transactions, schema enforcement, and time travel on lake data",
      "Implement a unified catalog (e.g., Unity Catalog, AWS Glue, Hive Metastore) so both BI tools and ML frameworks discover and access the same governed datasets",
      "Optimize query performance through data clustering, Z-ordering, file compaction, and partition pruning to match traditional data warehouse speed",
      "Enforce governance policies including row-level security, column masking, lineage tracking, and audit logging directly on lakehouse tables"
    ],
    "steps_zh": [
      "以开放文件格式（Parquet、ORC）将所有原始数据存储在云对象存储（S3、ADLS、GCS）上，保持数据湖的灵活性和成本效益",
      "使用Delta Lake、Apache Iceberg或Apache Hudi添加事务元数据层，为湖上数据提供ACID事务、模式强制和时间旅行",
      "实现统一目录（如Unity Catalog、AWS Glue、Hive Metastore），使BI工具和ML框架能发现和访问相同的治理数据集",
      "通过数据聚类、Z-ordering、文件压缩和分区裁剪优化查询性能，以匹配传统数据仓库速度",
      "在湖仓表上直接实施治理策略，包括行级安全、列掩码、血缘追踪和审计日志"
    ],
    "ai_relevant": true,
    "viz_type": "pyramid",
    "viz_labels": [
      "Raw Storage",
      "Table Format",
      "Unified Catalog",
      "Query Perf",
      "Governance"
    ],
    "viz_labels_zh": [
      "原始存储",
      "表格式",
      "统一目录",
      "查询优化",
      "数据治理"
    ],
    "related": [
      "star-schema",
      "data-mesh",
      "data-vault-2",
      "feature-store-pattern"
    ],
    "tags": [
      "lakehouse",
      "delta-lake",
      "iceberg",
      "unified-analytics",
      "data-warehouse"
    ],
    "origin_author": "Databricks (Armbrust, Ghodsi, Zaharia et al.), 2020",
    "origin_source": "Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics (CIDR 2021)",
    "origin_source_zh": "《湖仓：统一数据仓库与高级分析的新一代开放平台》（CIDR 2021）",
    "complexity": "intermediate",
    "when_to_use": [
      "When maintaining separate data lake and data warehouse systems creates data duplication and governance gaps",
      "When both SQL analytics and machine learning workloads need to operate on the same datasets",
      "When you need ACID transactions and schema enforcement on cloud object storage data",
      "When cost efficiency requires decoupling storage from compute while maintaining warehouse-grade query performance"
    ],
    "when_to_use_zh": [
      "当维护独立的数据湖和数据仓库系统造成数据重复和治理缺口时",
      "当SQL分析和机器学习工作负载需要在相同数据集上操作时",
      "当需要在云对象存储数据上实现ACID事务和模式强制时",
      "当成本效率要求存储与计算解耦同时保持仓库级查询性能时"
    ],
    "core_concepts": [
      "Open table formats: Delta Lake, Apache Iceberg, or Apache Hudi add a transactional metadata layer atop open file formats on object storage",
      "Storage-compute separation: data remains in cheap cloud object storage while multiple compute engines (Spark, Presto, Flink) query it independently",
      "ACID on the lake: transaction logs enable atomicity, consistency, isolation, and durability for data lake operations including concurrent writes",
      "Unified governance: a single catalog and permission model governs both analytical and ML access to the same physical data"
    ],
    "core_concepts_zh": [
      "开放表格式：Delta Lake、Apache Iceberg或Apache Hudi在对象存储的开放文件格式之上添加事务元数据层",
      "存储计算分离：数据保留在廉价的云对象存储中，多个计算引擎（Spark、Presto、Flink）独立查询",
      "湖上ACID：事务日志为数据湖操作提供原子性、一致性、隔离性和持久性，包括并发写入",
      "统一治理：单一目录和权限模型治理对相同物理数据的分析和ML访问"
    ],
    "timeline": [
      [
        "2017",
        "Databricks releases Delta Lake as an open-source transactional storage layer on Apache Spark"
      ],
      [
        "2020",
        "Databricks publishes the 'Lakehouse' vision paper unifying lake and warehouse paradigms"
      ],
      [
        "2021",
        "Apache Iceberg gains momentum as a vendor-neutral alternative; Netflix, Apple, and LinkedIn adopt it"
      ],
      [
        "2023",
        "Major cloud vendors offer managed lakehouse services; open table format wars drive rapid innovation"
      ]
    ],
    "timeline_zh": [
      [
        "2017",
        "Databricks在Apache Spark上发布开源事务存储层Delta Lake"
      ],
      [
        "2020",
        "Databricks发表「湖仓」愿景论文，统一数据湖和仓库范式"
      ],
      [
        "2021",
        "Apache Iceberg作为供应商中立替代方案获得动力；Netflix、Apple和LinkedIn采纳"
      ],
      [
        "2023",
        "主要云厂商提供托管湖仓服务；开放表格式竞争推动快速创新"
      ]
    ],
    "dos": [
      "Do choose an open table format (Iceberg, Delta, Hudi) to avoid vendor lock-in because the format is the foundation of your lakehouse",
      "Do implement a unified catalog so BI and ML workloads operate on the same governed data without ETL duplication",
      "Do tune file sizes and clustering strategies because small files on object storage severely degrade query performance",
      "Do enable time travel and versioning because it provides rollback capability and reproducible ML training datasets"
    ],
    "dos_zh": [
      "选择开放表格式（Iceberg、Delta、Hudi）以避免供应商锁定，因为格式是湖仓的基础",
      "实现统一目录，使BI和ML工作负载在相同治理数据上操作而无需ETL重复",
      "调优文件大小和聚类策略，因为对象存储上的小文件会严重降低查询性能",
      "启用时间旅行和版本管理，因为它提供回滚能力和可复现的ML训练数据集"
    ],
    "donts": [
      "Don't simply dump unmanaged files into a data lake and call it a lakehouse because the transactional metadata layer is what differentiates a lakehouse from a data swamp",
      "Don't ignore file compaction and maintenance operations because without them the lakehouse degrades into a slow, fragmented mess",
      "Don't assume all query engines perform equally on lakehouse tables because engine-format compatibility varies significantly",
      "Don't skip governance setup because a lakehouse without access controls inherits the worst aspects of ungoverned data lakes"
    ],
    "donts_zh": [
      "不要简单地将非托管文件倒入数据湖就称其为湖仓，因为事务元数据层才是湖仓与数据沼泽的区别",
      "不要忽视文件压缩和维护操作，否则湖仓会退化为缓慢且碎片化的混乱状态",
      "不要假设所有查询引擎在湖仓表上表现相同，因为引擎-格式兼容性差异显著",
      "不要跳过治理设置，因为没有访问控制的湖仓会继承不受治理的数据湖的最坏方面"
    ],
    "case_study_company": "Databricks / CERN",
    "case_study": "CERN adopted a Delta Lake-based lakehouse architecture to manage petabytes of particle physics data from the Large Hadron Collider. By unifying their data lake and analytical warehouse, physicists can run both SQL queries for reporting and Spark ML pipelines for anomaly detection on the same ACID-compliant datasets. This eliminated the need for costly data duplication between separate lake and warehouse systems.",
    "case_study_zh": "CERN采用基于Delta Lake的湖仓架构来管理来自大型强子对撞机的PB级粒子物理数据。通过统一数据湖和分析仓库，物理学家可以在相同的ACID兼容数据集上运行SQL查询进行报告和Spark ML管道进行异常检测。这消除了在独立湖和仓库系统之间进行昂贵数据复制的需要。",
    "when_not_to_use": [
      "When a traditional data warehouse meets all your needs and the added complexity of managing open table formats is not justified",
      "When your workloads are exclusively SQL analytics with no ML or unstructured data requirements",
      "When real-time sub-second query latency is critical and a purpose-built OLAP database outperforms lakehouse query engines",
      "When organizational maturity cannot support the operational overhead of managing object storage, table formats, and compute engines separately"
    ],
    "when_not_to_use_zh": [
      "当传统数据仓库满足所有需求且管理开放表格式的额外复杂性不合理时",
      "当工作负载完全是SQL分析，没有ML或非结构化数据需求时",
      "当需要亚秒级实时查询延迟且专用OLAP数据库优于湖仓查询引擎时",
      "当组织成熟度不足以支撑分别管理对象存储、表格式和计算引擎的运维开销时"
    ],
    "adopters": [
      "Databricks",
      "Netflix",
      "Apple",
      "CERN",
      "Comcast"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "performance",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Armbrust, M., Ghodsi, A., Xin, R. & Zaharia, M. (2021). \"Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics\". Proceedings of CIDR.",
    "secondary_sources": [
      "Armbrust, M. et al. (2020). \"Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores\". Proceedings of the VLDB Endowment, 13(12).",
      "Databricks (2020). \"What Is a Lakehouse?\". databricks.com."
    ],
    "typed_relations": [
      {
        "slug": "star-schema",
        "type": "complement"
      },
      {
        "slug": "data-mesh",
        "type": "complement"
      },
      {
        "slug": "data-vault-2",
        "type": "alternative"
      },
      {
        "slug": "feature-store-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 107,
    "name": "Star Schema",
    "name_zh": "星型模式",
    "slug": "star-schema",
    "category": "data",
    "desc": "Dimensional modeling with fact and dimension tables for analytics",
    "desc_zh": "以事实表和维度表进行维度建模的分析架构",
    "steps": [
      "Identify the business process to model and define the grain: the most atomic level of detail each fact table row represents",
      "Design the central fact table containing quantitative measures (revenue, quantity, duration) and foreign keys to all relevant dimensions",
      "Build dimension tables with descriptive attributes (customer name, product category, date hierarchy) that provide context for slicing and filtering facts",
      "Denormalize dimension tables into flat, wide structures to optimize join performance and simplify queries for analysts and BI tools",
      "Load the star schema using ETL/ELT pipelines with surrogate keys, slowly changing dimension handling, and incremental fact loading strategies"
    ],
    "steps_zh": [
      "识别要建模的业务过程并定义粒度：事实表中每一行所代表的最原子级别的细节",
      "设计中心事实表，包含定量度量（收入、数量、持续时间）和所有相关维度的外键",
      "构建维度表，包含描述性属性（客户名、产品类别、日期层次），为事实的切片和过滤提供上下文",
      "将维度表反规范化为扁平宽结构，以优化连接性能并简化分析师和BI工具的查询",
      "使用ETL/ELT管道加载星型模式，包括代理键、缓慢变化维度处理和增量事实加载策略"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Fact Table",
      "Dimension",
      "Grain",
      "Surrogate Key",
      "Star Join"
    ],
    "viz_labels_zh": [
      "事实表",
      "维度表",
      "粒度",
      "代理键",
      "星型连接"
    ],
    "related": [
      "data-vault-2",
      "data-lakehouse",
      "polyglot-persistence"
    ],
    "tags": [
      "dimensional-modeling",
      "star-schema",
      "kimball",
      "analytics",
      "data-warehouse"
    ],
    "origin_author": "Ralph Kimball, 1996",
    "origin_source": "The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling (Wiley, 1996; 3rd edition 2013)",
    "origin_source_zh": "《数据仓库工具箱：维度建模权威指南》（Wiley，1996；第三版2013）",
    "complexity": "intermediate",
    "when_to_use": [
      "When building a data warehouse or mart for business intelligence and ad-hoc analytical queries",
      "When query simplicity and performance for aggregate operations are more important than storage efficiency",
      "When BI tools like Tableau, Power BI, or Looker need intuitive, join-friendly schema structures",
      "When business users need self-service analytics with predictable, understandable data models"
    ],
    "when_to_use_zh": [
      "当为商业智能和临时分析查询构建数据仓库或数据集市时",
      "当聚合操作的查询简洁性和性能比存储效率更重要时",
      "当Tableau、Power BI或Looker等BI工具需要直观的、连接友好的模式结构时",
      "当业务用户需要使用可预测、可理解的数据模型进行自助分析时"
    ],
    "core_concepts": [
      "Fact tables: central tables containing quantitative measures (metrics) and foreign keys referencing dimension tables, representing business events at a specific grain",
      "Dimension tables: wide, denormalized tables containing descriptive attributes that provide the 'who, what, where, when, why' context for analyzing facts",
      "Grain declaration: explicitly defining what each row in a fact table represents to prevent ambiguity in aggregation and drill-down queries",
      "Slowly changing dimensions: strategies (Type 1, 2, 3) for tracking how dimension attributes change over time while preserving historical accuracy"
    ],
    "core_concepts_zh": [
      "事实表：包含定量度量（指标）和引用维度表外键的中心表，以特定粒度表示业务事件",
      "维度表：宽的反规范化表，包含描述性属性，为分析事实提供「谁、什么、哪里、何时、为什么」的上下文",
      "粒度声明：明确定义事实表中每行代表什么，以防止聚合和下钻查询中的歧义",
      "缓慢变化维度：追踪维度属性随时间变化的策略（类型1、2、3），同时保持历史准确性"
    ],
    "timeline": [
      [
        "1996",
        "Ralph Kimball publishes 'The Data Warehouse Toolkit' establishing dimensional modeling as the standard approach"
      ],
      [
        "2004",
        "Kimball and Caserta release 'The Data Warehouse ETL Toolkit' formalizing star schema loading patterns"
      ],
      [
        "2013",
        "Third edition of 'The Data Warehouse Toolkit' updates techniques for modern columnar databases and cloud platforms"
      ],
      [
        "2020",
        "Star schema remains the dominant BI modeling pattern even as lakehouse architectures emerge"
      ]
    ],
    "timeline_zh": [
      [
        "1996",
        "Ralph Kimball出版《数据仓库工具箱》，确立维度建模为标准方法"
      ],
      [
        "2004",
        "Kimball和Caserta发布《数据仓库ETL工具箱》，正式化星型模式加载模式"
      ],
      [
        "2013",
        "《数据仓库工具箱》第三版为现代列式数据库和云平台更新技术"
      ],
      [
        "2020",
        "即使湖仓架构兴起，星型模式仍是主导的BI建模模式"
      ]
    ],
    "dos": [
      "Do declare the grain of every fact table explicitly because ambiguous grain leads to incorrect aggregations and double-counting",
      "Do denormalize dimension tables fully because normalized dimensions (snowflake schema) add join complexity with minimal storage savings in analytical workloads",
      "Do use surrogate keys in dimension tables because natural keys change and cause referential integrity issues",
      "Do design conformed dimensions shared across multiple fact tables because they enable cross-process analysis and consistent reporting"
    ],
    "dos_zh": [
      "明确声明每个事实表的粒度，因为模糊的粒度会导致不正确的聚合和重复计数",
      "完全反规范化维度表，因为规范化维度（雪花模式）在分析工作负载中增加连接复杂性而存储节省极少",
      "在维度表中使用代理键，因为自然键会变化并导致引用完整性问题",
      "设计跨多个事实表共享的一致维度，因为它们支持跨过程分析和一致性报告"
    ],
    "donts": [
      "Don't normalize dimension tables into snowflake schema for analytical workloads because it forces multi-table joins that slow BI queries",
      "Don't mix grain levels in a single fact table because it makes aggregation unreliable and confuses business users",
      "Don't skip slowly changing dimension handling because losing historical dimension values makes historical analysis inaccurate",
      "Don't create operational (OLTP-style) schemas for analytical purposes because they are optimized for transactional writes, not analytical reads"
    ],
    "donts_zh": [
      "不要为分析工作负载将维度表规范化为雪花模式，因为这会强制多表连接导致BI查询变慢",
      "不要在单个事实表中混合粒度级别，因为这使聚合不可靠并使业务用户感到困惑",
      "不要跳过缓慢变化维度处理，因为丢失历史维度值会使历史分析不准确",
      "不要为分析目的创建操作型（OLTP风格）模式，因为它们为事务写入优化而非分析读取"
    ],
    "case_study_company": "Walmart",
    "case_study": "Walmart's enterprise data warehouse uses Kimball-style star schemas to model its retail operations across 10,000+ stores. Fact tables capture point-of-sale transactions at the item-store-day grain, while conformed dimensions for product, store, and time enable cross-departmental analytics. This design allows supply chain, marketing, and finance teams to query the same data with consistent results using standard BI tools.",
    "case_study_zh": "Walmart的企业数据仓库使用Kimball风格的星型模式建模其跨10,000多家门店的零售运营。事实表以商品-门店-天粒度捕获销售点交易，而产品、门店和时间的一致维度支持跨部门分析。这种设计使供应链、营销和财务团队能够使用标准BI工具以一致的结果查询相同数据。",
    "when_not_to_use": [
      "When the use case is primarily OLTP with heavy insert/update workloads that require normalized schemas",
      "When data relationships are highly complex and graph-like rather than dimensional",
      "When schema flexibility is needed and the data structure changes frequently (consider schema-on-read approaches instead)",
      "When real-time streaming analytics are needed and pre-aggregated materialized views are more appropriate"
    ],
    "when_not_to_use_zh": [
      "当用例主要是OLTP且有大量插入/更新工作负载需要规范化模式时",
      "当数据关系高度复杂且呈图状而非维度化时",
      "当需要模式灵活性且数据结构频繁变化时（考虑读时模式方法）",
      "当需要实时流分析且预聚合物化视图更合适时"
    ],
    "adopters": [
      "Walmart",
      "Amazon",
      "Target",
      "Netflix",
      "Airbnb"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "performance",
      "usability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Kimball, R. & Ross, M. (2013). \"The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling\", 3rd ed. Wiley.",
    "secondary_sources": [
      "Kimball, R. (1996). \"The Data Warehouse Toolkit: Practical Techniques for Building Dimensional Data Warehouses\". Wiley.",
      "Inmon, W.H. (2005). \"Building the Data Warehouse\", 4th ed. Wiley.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 3. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "data-vault-2",
        "type": "alternative"
      },
      {
        "slug": "data-lakehouse",
        "type": "related"
      },
      {
        "slug": "polyglot-persistence",
        "type": "related"
      }
    ]
  },
  {
    "id": 108,
    "name": "Data Vault 2.0",
    "name_zh": "Data Vault 2.0",
    "slug": "data-vault-2",
    "category": "data",
    "desc": "Agile, auditable data warehousing with hubs, links, and satellites",
    "desc_zh": "以Hub、Link和Satellite实现敏捷可审计的数据仓库方法论",
    "steps": [
      "Identify business keys from source systems and model them as Hub tables, each representing a core business concept (customer, product, order) with a hash key and load metadata",
      "Model relationships between hubs as Link tables that capture associations (customer-places-order) with their own hash keys and load metadata",
      "Attach descriptive and contextual attributes to hubs and links as Satellite tables, each timestamped to track the full history of changes",
      "Load the raw vault from source systems using pattern-based, parallelizable ETL that inserts without updating, preserving full auditability",
      "Build business vault and information marts on top of the raw vault to apply business rules and present data in star schema form for consumption"
    ],
    "steps_zh": [
      "从源系统识别业务键并建模为Hub表，每个代表一个核心业务概念（客户、产品、订单），包含哈希键和加载元数据",
      "将Hub之间的关系建模为Link表，捕获关联（客户-下-订单），有自己的哈希键和加载元数据",
      "将描述性和上下文属性作为Satellite表附加到Hub和Link上，每条带时间戳以追踪完整变更历史",
      "使用基于模式的、可并行化的ETL从源系统加载原始库，仅插入不更新，保留完整可审计性",
      "在原始库之上构建业务库和信息集市，应用业务规则并以星型模式形式呈现数据供消费"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Hub",
      "Link",
      "Satellite",
      "Raw Vault",
      "Info Mart"
    ],
    "viz_labels_zh": [
      "核心表",
      "关联表",
      "附属表",
      "原始库",
      "信息集市"
    ],
    "related": [
      "star-schema",
      "data-lakehouse",
      "data-mesh",
      "database-migration-patterns"
    ],
    "tags": [
      "data-vault",
      "agile-warehouse",
      "auditability",
      "hub-link-satellite"
    ],
    "origin_author": "Dan Linstedt, 2000 (formalized as 2.0 in 2013)",
    "origin_source": "Building a Scalable Data Warehouse with Data Vault 2.0 (Morgan Kaufmann, 2015)",
    "origin_source_zh": "《使用Data Vault 2.0构建可扩展数据仓库》（Morgan Kaufmann，2015）",
    "complexity": "advanced",
    "when_to_use": [
      "When multiple source systems with overlapping business keys need to be integrated into a single warehouse",
      "When full historical auditability and traceability of every data change is a regulatory or business requirement",
      "When the data warehouse must accommodate frequent source system changes without major rework",
      "When agile, incremental delivery of warehouse functionality is needed with parallel development across teams"
    ],
    "when_to_use_zh": [
      "当多个具有重叠业务键的源系统需要集成到单一仓库时",
      "当每个数据变更的完整历史可审计性和可追溯性是监管或业务要求时",
      "当数据仓库必须适应频繁的源系统变更而无需大规模返工时",
      "当需要敏捷、增量交付仓库功能且跨团队并行开发时"
    ],
    "core_concepts": [
      "Hubs: tables anchored on business keys representing core business entities, providing a stable integration point across source systems",
      "Links: tables capturing relationships between hubs, modeling the associations and transactions between business entities",
      "Satellites: tables containing descriptive attributes and context for hubs and links, timestamped to maintain full change history",
      "Insert-only loading: all data is appended, never updated or deleted in the raw vault, ensuring complete auditability and reproducibility"
    ],
    "core_concepts_zh": [
      "Hub：以业务键为锚点的表，代表核心业务实体，提供跨源系统的稳定集成点",
      "Link：捕获Hub之间关系的表，建模业务实体之间的关联和事务",
      "Satellite：包含Hub和Link描述性属性和上下文的表，带时间戳以维护完整变更历史",
      "仅插入加载：原始库中所有数据仅追加，不更新或删除，确保完整的可审计性和可重现性"
    ],
    "timeline": [
      [
        "2000",
        "Dan Linstedt introduces the original Data Vault modeling methodology"
      ],
      [
        "2013",
        "Data Vault 2.0 formalized with hash keys, pattern-based loading, and NoSQL support"
      ],
      [
        "2015",
        "Linstedt and Olschimke publish 'Building a Scalable Data Warehouse with Data Vault 2.0' as the definitive guide"
      ],
      [
        "2020",
        "Data Vault 2.0 gains renewed interest as cloud data warehouses (Snowflake, BigQuery) make its patterns more performant"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "Dan Linstedt引入最初的Data Vault建模方法论"
      ],
      [
        "2013",
        "Data Vault 2.0通过哈希键、基于模式的加载和NoSQL支持正式化"
      ],
      [
        "2015",
        "Linstedt和Olschimke出版《使用Data Vault 2.0构建可扩展数据仓库》作为权威指南"
      ],
      [
        "2020",
        "随着云数据仓库（Snowflake、BigQuery）提升其模式性能，Data Vault 2.0获得新的关注"
      ]
    ],
    "dos": [
      "Do use hash keys for hub and link primary keys because they enable parallel loading and deterministic key generation across distributed systems",
      "Do keep the raw vault free of business rules because it should be a pure, auditable reflection of source system data",
      "Do apply business rules in a separate business vault layer because mixing integration and interpretation creates fragile, hard-to-maintain models",
      "Do generate ETL code from metadata templates because the repetitive hub/link/satellite loading patterns are ideal for automation"
    ],
    "dos_zh": [
      "使用哈希键作为Hub和Link的主键，因为它们支持并行加载和跨分布式系统的确定性键生成",
      "保持原始库不含业务规则，因为它应该是源系统数据的纯净、可审计的反映",
      "在单独的业务库层应用业务规则，因为混合集成和解释会创建脆弱、难以维护的模型",
      "从元数据模板生成ETL代码，因为重复的Hub/Link/Satellite加载模式非常适合自动化"
    ],
    "donts": [
      "Don't expose raw vault tables directly to business users because the hub-link-satellite structure is not intuitive for analysis",
      "Don't update or delete records in the raw vault because it destroys the audit trail and historical traceability",
      "Don't skip the information mart layer because business users need star schema or flat table views for their BI tools",
      "Don't create point-to-point links between satellites because all relationships must flow through properly modeled link tables"
    ],
    "donts_zh": [
      "不要将原始库表直接暴露给业务用户，因为Hub-Link-Satellite结构对分析而言不直观",
      "不要更新或删除原始库中的记录，因为这会破坏审计追踪和历史可追溯性",
      "不要跳过信息集市层，因为业务用户需要星型模式或扁平表视图用于BI工具",
      "不要在Satellite之间创建点对点链接，因为所有关系必须通过正确建模的Link表流转"
    ],
    "case_study_company": "ING Bank",
    "case_study": "ING Bank adopted Data Vault 2.0 to integrate data from over 40 source systems across its global banking operations. The hub-link-satellite model allowed regulatory reporting teams to trace every data point back to its source system and exact load timestamp, meeting strict European banking audit requirements. Parallel loading patterns reduced their nightly ETL window from 12 hours to under 3 hours.",
    "case_study_zh": "ING银行采用Data Vault 2.0集成其全球银行业务中超过40个源系统的数据。Hub-Link-Satellite模型使监管报告团队能够将每个数据点追溯到其源系统和确切加载时间戳，满足严格的欧洲银行审计要求。并行加载模式将夜间ETL窗口从12小时缩短到不到3小时。",
    "when_not_to_use": [
      "When the organization has a single source system and integration across heterogeneous sources is not needed",
      "When the team lacks data modeling expertise and the learning curve of Data Vault 2.0 is too steep",
      "When real-time analytics are the primary use case and the multi-layer architecture adds unacceptable latency",
      "When data volumes are small and a simple star schema directly loaded from sources is sufficient"
    ],
    "when_not_to_use_zh": [
      "当组织只有单一源系统且不需要跨异构源集成时",
      "当团队缺乏数据建模专业知识且Data Vault 2.0学习曲线过陡时",
      "当实时分析是主要用例且多层架构增加不可接受的延迟时",
      "当数据量较小且从源直接加载的简单星型模式即可满足时"
    ],
    "adopters": [
      "ING Bank",
      "Pfizer",
      "USAA",
      "DirecTV",
      "Department of Defense (US)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Linstedt, D. & Olschimke, M. (2015). \"Building a Scalable Data Warehouse with Data Vault 2.0\". Morgan Kaufmann.",
    "secondary_sources": [
      "Linstedt, D. (2002). \"Data Vault Series\". The Data Administration Newsletter (TDAN.com).",
      "Linstedt, D. (2010). \"Super Charge Your Data Warehouse\". CreateSpace."
    ],
    "typed_relations": [
      {
        "slug": "star-schema",
        "type": "alternative"
      },
      {
        "slug": "data-lakehouse",
        "type": "complement"
      },
      {
        "slug": "data-mesh",
        "type": "complement"
      },
      {
        "slug": "database-migration-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 109,
    "name": "Polyglot Persistence",
    "name_zh": "多语言持久化",
    "slug": "polyglot-persistence",
    "category": "data",
    "desc": "Use purpose-fit databases for different data access patterns",
    "desc_zh": "为不同数据访问模式使用最合适的数据库技术",
    "steps": [
      "Analyze each service or bounded context to understand its data access patterns: read/write ratio, query complexity, consistency requirements, and data structure",
      "Select the optimal database technology for each pattern: relational for transactions, document stores for flexible schemas, graph databases for relationships, key-value for caching, time-series for metrics",
      "Define clear data ownership boundaries so each service owns its database and no service directly accesses another service's datastore",
      "Implement cross-service data synchronization using event-driven patterns (CDC, events, sagas) rather than shared databases or direct cross-service queries",
      "Establish operational standards for each database technology including backup, monitoring, scaling, and security procedures to manage the increased operational surface area"
    ],
    "steps_zh": [
      "分析每个服务或限界上下文以理解其数据访问模式：读写比、查询复杂性、一致性要求和数据结构",
      "为每种模式选择最佳数据库技术：关系型用于事务、文档存储用于灵活模式、图数据库用于关系、键值存储用于缓存、时序数据库用于指标",
      "定义清晰的数据所有权边界，每个服务拥有自己的数据库，外部服务不直接访问其数据存储",
      "使用事件驱动模式（CDC、事件、Saga）而非共享数据库或直接跨服务查询来实现跨服务数据同步",
      "为每种数据库技术建立运维标准，包括备份、监控、扩展和安全流程，以管理增加的运维面"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Relational DB",
      "Document DB",
      "Graph DB",
      "Key-Value",
      "Time-Series"
    ],
    "viz_labels_zh": [
      "关系数据库",
      "文档数据库",
      "图数据库",
      "键值存储",
      "时序数据库"
    ],
    "related": [
      "microservices-decomposition",
      "data-mesh",
      "cap-theorem",
      "change-data-capture"
    ],
    "tags": [
      "polyglot",
      "database",
      "persistence",
      "microservices",
      "data-access"
    ],
    "origin_author": "Martin Fowler / Pramod Sadalage, 2011",
    "origin_source": "Polyglot Persistence (martinfowler.com blog, 2011); NoSQL Distilled (Addison-Wesley, 2012)",
    "origin_source_zh": "「多语言持久化」（martinfowler.com博客，2011）；《NoSQL精粹》（Addison-Wesley，2012）",
    "complexity": "intermediate",
    "when_to_use": [
      "When different parts of your system have fundamentally different data access patterns that no single database optimizes well",
      "When a microservices architecture requires each service to own its persistence independently",
      "When performance requirements demand specialized databases (e.g., graph for social networks, time-series for IoT metrics)",
      "When the one-size-fits-all relational database has become a bottleneck for specific workloads"
    ],
    "when_to_use_zh": [
      "当系统不同部分具有根本不同的数据访问模式，没有单一数据库能良好优化时",
      "当微服务架构要求每个服务独立拥有其持久化时",
      "当性能要求需要专用数据库（如社交网络的图数据库、IoT指标的时序数据库）时",
      "当一刀切的关系数据库已成为特定工作负载的瓶颈时"
    ],
    "core_concepts": [
      "Purpose-fit storage: selecting database technology based on the specific data model, query patterns, and consistency needs of each bounded context",
      "Database-per-service: each microservice owns and encapsulates its database, preventing tight coupling through shared data stores",
      "Data synchronization: using events, CDC, or sagas to keep data eventually consistent across services that use different database technologies",
      "Operational complexity trade-off: gaining performance and modeling advantages at the cost of managing multiple database technologies, backup strategies, and operational playbooks"
    ],
    "core_concepts_zh": [
      "适配存储：基于每个限界上下文的特定数据模型、查询模式和一致性需求选择数据库技术",
      "每服务一数据库：每个微服务拥有并封装其数据库，防止通过共享数据存储产生紧耦合",
      "数据同步：使用事件、CDC或Saga在使用不同数据库技术的服务之间保持最终一致性",
      "运维复杂性权衡：以管理多种数据库技术、备份策略和运维手册的成本换取性能和建模优势"
    ],
    "timeline": [
      [
        "2009",
        "NoSQL movement gains momentum with MongoDB, Cassandra, and Redis challenging relational database hegemony"
      ],
      [
        "2011",
        "Martin Fowler popularizes the term 'Polyglot Persistence' with an influential blog post on choosing databases per use case"
      ],
      [
        "2012",
        "Fowler and Sadalage publish 'NoSQL Distilled' providing a systematic framework for database technology selection"
      ],
      [
        "2018",
        "Managed cloud database services (DynamoDB, CosmosDB, Cloud Spanner) lower the operational barrier to polyglot persistence"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "NoSQL运动随MongoDB、Cassandra和Redis挑战关系数据库霸权而获得动力"
      ],
      [
        "2011",
        "Martin Fowler通过一篇关于按用例选择数据库的有影响力的博客文章推广了「多语言持久化」这一术语"
      ],
      [
        "2012",
        "Fowler和Sadalage出版《NoSQL精粹》，提供数据库技术选择的系统框架"
      ],
      [
        "2018",
        "托管云数据库服务（DynamoDB、CosmosDB、Cloud Spanner）降低了多语言持久化的运维门槛"
      ]
    ],
    "dos": [
      "Do choose databases based on data access patterns, not technology hype because the wrong database for a workload creates more problems than it solves",
      "Do keep database choices encapsulated within service boundaries because leaking database specifics creates tight coupling",
      "Do invest in team training for each database technology adopted because operational expertise is critical for production reliability",
      "Do start with fewer databases and add specialized ones only when measured performance demands it because every additional database technology adds operational overhead"
    ],
    "dos_zh": [
      "基于数据访问模式而非技术热潮选择数据库，因为不适合工作负载的数据库会制造更多问题",
      "将数据库选择封装在服务边界内，因为泄露数据库细节会产生紧耦合",
      "为每种采用的数据库技术投资团队培训，因为运维专业知识对生产可靠性至关重要",
      "从较少的数据库开始，仅在测量的性能需求要求时才添加专用数据库，因为每增加一种数据库技术都会带来额外的运维开销"
    ],
    "donts": [
      "Don't adopt a new database technology for every service because the operational overhead grows multiplicatively with each new technology",
      "Don't share databases between services to avoid the synchronization problem because shared databases create the tightest form of coupling",
      "Don't ignore the operational tax of running multiple database technologies because each needs its own monitoring, backup, and upgrade procedures",
      "Don't choose a NoSQL database just to be modern when a relational database meets your needs because unnecessary complexity is the enemy of reliability"
    ],
    "donts_zh": [
      "不要为每个服务采用新的数据库技术，因为运维开销随每种新技术成倍增长",
      "不要在服务间共享数据库以规避同步问题，因为共享数据库产生最紧密的耦合形式",
      "不要忽视运行多种数据库技术的运维税，因为每种都需要自己的监控、备份和升级流程",
      "当关系数据库满足需求时不要为了时髦而选择NoSQL，因为不必要的复杂性是可靠性的敌人"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix employs polyglot persistence across its microservices: Cassandra for distributed session and viewing history storage, MySQL for billing and account data requiring ACID transactions, Elasticsearch for content search, EVCache (memcached) for low-latency caching, and a proprietary time-series store for real-time streaming telemetry. Each database is chosen to optimize for the specific access pattern of its owning service.",
    "case_study_zh": "Netflix在其微服务中采用多语言持久化：Cassandra用于分布式会话和观看历史存储，MySQL用于需要ACID事务的计费和账户数据，Elasticsearch用于内容搜索，EVCache（memcached）用于低延迟缓存，以及专有时序存储用于实时流媒体遥测。每种数据库都针对其所属服务的特定访问模式进行优化。",
    "when_not_to_use": [
      "When a single database technology adequately serves all your data access patterns and adding complexity is not justified",
      "When the team is small and cannot afford the operational overhead of managing multiple database technologies",
      "When strong transactional consistency across all data is required and distributed transactions are too complex",
      "When regulatory requirements mandate a single auditable datastore for all organizational data"
    ],
    "when_not_to_use_zh": [
      "当单一数据库技术能够充分服务所有数据访问模式且增加复杂性不合理时",
      "当团队规模较小且无法承担管理多种数据库技术的运维开销时",
      "当需要跨所有数据的强事务一致性且分布式事务过于复杂时",
      "当监管要求为所有组织数据使用单一可审计的数据存储时"
    ],
    "adopters": [
      "Netflix",
      "Amazon",
      "Spotify",
      "LinkedIn",
      "eBay"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "performance",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Sadalage, P.J. & Fowler, M. (2012). \"NoSQL Distilled: A Brief Guide to the Emerging World of Polyglot Persistence\". Addison-Wesley.",
    "secondary_sources": [
      "Fowler, M. (2011). \"Polyglot Persistence\". martinfowler.com.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "microservices-decomposition",
        "type": "complement"
      },
      {
        "slug": "data-mesh",
        "type": "complement"
      },
      {
        "slug": "cap-theorem",
        "type": "complement"
      },
      {
        "slug": "change-data-capture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 110,
    "name": "Feature Store Pattern",
    "name_zh": "特征存储模式",
    "slug": "feature-store-pattern",
    "category": "data",
    "desc": "Centralized ML feature management for training and serving",
    "desc_zh": "集中化ML特征管理，统一训练与在线服务",
    "steps": [
      "Define features as reusable, versioned entities with clear ownership, descriptions, data types, and lineage tracked in a feature registry",
      "Build feature pipelines that compute features from raw data sources using batch (Spark) or real-time (Flink, Kafka) transformations and write them to the feature store",
      "Materialize features into both an offline store (data warehouse or lake) for training and an online store (Redis, DynamoDB) for low-latency serving",
      "Ensure training-serving consistency by using the same feature definitions and transformation logic for both model training datasets and real-time inference requests",
      "Enable feature discovery, sharing, and monitoring by providing a catalog with search, usage metrics, data quality checks, and drift detection"
    ],
    "steps_zh": [
      "将特征定义为可复用、版本化的实体，在特征注册表中追踪清晰的所有权、描述、数据类型和血缘",
      "构建特征管道，使用批处理（Spark）或实时（Flink、Kafka）转换从原始数据源计算特征并写入特征存储",
      "将特征物化到离线存储（数据仓库或湖）用于训练和在线存储（Redis、DynamoDB）用于低延迟服务",
      "通过对模型训练数据集和实时推理请求使用相同的特征定义和转换逻辑来确保训练-服务一致性",
      "通过提供带搜索、使用指标、数据质量检查和漂移检测的目录来实现特征发现、共享和监控"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Feature Registry",
      "Pipeline",
      "Offline Store",
      "Online Store",
      "Monitoring"
    ],
    "viz_labels_zh": [
      "特征注册",
      "特征管道",
      "离线存储",
      "在线存储",
      "特征监控"
    ],
    "related": [
      "mlops",
      "data-lakehouse",
      "data-mesh",
      "stream-processing-patterns"
    ],
    "tags": [
      "feature-store",
      "ml-features",
      "training-serving",
      "mlops",
      "data-management"
    ],
    "origin_author": "Uber Michelangelo team (Hermann, Del Balso et al.), 2017",
    "origin_source": "Meet Michelangelo: Uber's Machine Learning Platform (Uber Engineering Blog, 2017)",
    "origin_source_zh": "「认识Michelangelo：Uber的机器学习平台」（Uber工程博客，2017）",
    "complexity": "intermediate",
    "when_to_use": [
      "When multiple ML models across teams share common features and redundant computation wastes resources",
      "When training-serving skew causes model accuracy degradation in production",
      "When feature engineering is the bottleneck in ML development and data scientists spend most of their time on data preparation",
      "When feature lineage, versioning, and quality monitoring are required for model governance and reproducibility"
    ],
    "when_to_use_zh": [
      "当多个团队的多个ML模型共享通用特征且冗余计算浪费资源时",
      "当训练-服务偏差导致生产中模型精度下降时",
      "当特征工程是ML开发的瓶颈且数据科学家大部分时间花在数据准备上时",
      "当模型治理和可重现性要求特征血缘、版本管理和质量监控时"
    ],
    "core_concepts": [
      "Feature registry: a centralized catalog where features are defined, versioned, documented, and discoverable across the organization",
      "Offline-online duality: the same features are materialized into an offline store for batch training and an online store for real-time serving with consistent semantics",
      "Training-serving consistency: identical feature transformation logic is used during training and inference, eliminating the training-serving skew that degrades model quality",
      "Point-in-time correctness: features for training datasets are computed as of the event timestamp to prevent data leakage and ensure temporal validity"
    ],
    "core_concepts_zh": [
      "特征注册表：集中化目录，在其中定义、版本化、记录特征并使其在组织范围内可发现",
      "离线-在线双重性：相同特征以一致语义物化到离线存储（用于批量训练）和在线存储（用于实时服务）",
      "训练-服务一致性：训练和推理使用相同的特征转换逻辑，消除降低模型质量的训练-服务偏差",
      "时间点正确性：训练数据集的特征按事件时间戳计算，防止数据泄露并确保时间有效性"
    ],
    "timeline": [
      [
        "2017",
        "Uber introduces Michelangelo with the first production-scale feature store for ML feature management"
      ],
      [
        "2019",
        "Feast (Google/Gojek) launches as the first open-source feature store; Hopsworks and Tecton follow"
      ],
      [
        "2021",
        "Feature stores become a standard component in ML platforms; AWS SageMaker and Databricks add feature store services"
      ],
      [
        "2023",
        "Feature stores evolve to support real-time features, streaming transformations, and LLM embedding management"
      ]
    ],
    "timeline_zh": [
      [
        "2017",
        "Uber推出Michelangelo，首个生产规模的ML特征管理特征存储"
      ],
      [
        "2019",
        "Feast（Google/Gojek）作为首个开源特征存储发布；Hopsworks和Tecton紧随其后"
      ],
      [
        "2021",
        "特征存储成为ML平台的标准组件；AWS SageMaker和Databricks添加特征存储服务"
      ],
      [
        "2023",
        "特征存储演进以支持实时特征、流式转换和LLM嵌入管理"
      ]
    ],
    "dos": [
      "Do enforce point-in-time correctness when generating training datasets because temporal data leakage silently corrupts model quality",
      "Do version features and track lineage because reproducibility of ML experiments depends on knowing exactly which feature definitions were used",
      "Do monitor feature data quality and distribution drift in production because degraded features silently degrade model predictions",
      "Do design features to be reusable across multiple models because the primary value of a feature store is shared computation"
    ],
    "dos_zh": [
      "生成训练数据集时强制时间点正确性，因为时间数据泄露会悄悄破坏模型质量",
      "对特征进行版本管理和血缘追踪，因为ML实验的可重现性依赖于确切知道使用了哪些特征定义",
      "在生产中监控特征数据质量和分布漂移，因为退化的特征会悄悄降低模型预测",
      "设计可跨多个模型复用的特征，因为特征存储的主要价值是共享计算"
    ],
    "donts": [
      "Don't let data scientists compute features in ad-hoc notebooks and deploy them to production without the feature store because it creates training-serving skew",
      "Don't skip the online store for real-time serving models because fetching features from the offline store at serving time introduces unacceptable latency",
      "Don't treat the feature store as a data warehouse replacement because it is specifically designed for ML feature lifecycle management",
      "Don't ignore feature freshness requirements because stale features in the online store produce predictions based on outdated information"
    ],
    "donts_zh": [
      "不要让数据科学家在临时notebook中计算特征并绕过特征存储部署到生产，因为这会产生训练-服务偏差",
      "不要为实时服务模型跳过在线存储，因为在服务时从离线存储获取特征会引入不可接受的延迟",
      "不要将特征存储视为数据仓库的替代品，因为它专门为ML特征生命周期管理而设计",
      "不要忽视特征时效性要求，因为在线存储中的过时特征会基于过时信息产生预测"
    ],
    "case_study_company": "Uber",
    "case_study": "Uber's Michelangelo platform introduced the feature store concept to solve the problem of thousands of ML models needing consistent access to shared features like trip distance, driver rating, and surge multiplier. By centralizing feature computation, Uber eliminated redundant feature pipelines across teams, ensured training-serving parity, and reduced the time to deploy a new ML model from months to weeks.",
    "case_study_zh": "Uber的Michelangelo平台引入特征存储概念，解决数千个ML模型需要一致访问共享特征（如行程距离、司机评分和动态倍数）的问题。通过集中化特征计算，Uber消除了跨团队的冗余特征管道，确保了训练-服务对等性，并将部署新ML模型的时间从数月缩短到数周。",
    "when_not_to_use": [
      "When you have only one or two ML models that do not share features and the overhead of a feature store is not justified",
      "When all features are simple direct database lookups with no transformation logic that could cause training-serving skew",
      "When the organization is in early ML adoption and simpler feature management through version-controlled scripts is sufficient",
      "When models do not require real-time serving and batch prediction with offline features meets all requirements"
    ],
    "when_not_to_use_zh": [
      "当只有一两个不共享特征的ML模型且特征存储的开销不合理时",
      "当所有特征都是简单的直接数据库查找，没有可能导致训练-服务偏差的转换逻辑时",
      "当组织处于ML早期采纳阶段且通过版本控制脚本的简单特征管理即可满足时",
      "当模型不需要实时服务且使用离线特征的批量预测满足所有要求时"
    ],
    "adopters": [
      "Uber",
      "Airbnb",
      "Spotify",
      "DoorDash",
      "Gojek"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Hermann, J., Del Balso, M. et al. (2017). \"Meet Michelangelo: Uber's Machine Learning Platform\". Uber Engineering Blog.",
    "secondary_sources": [
      "Orr, L. et al. (2021). \"Managing ML Pipelines: Feature Stores and the Coming Wave of Embedding Ecosystems\". Proceedings of the VLDB Endowment, 14(12).",
      "Huyen, C. (2022). \"Designing Machine Learning Systems\", Ch. 7. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "mlops",
        "type": "complement"
      },
      {
        "slug": "data-lakehouse",
        "type": "complement"
      },
      {
        "slug": "data-mesh",
        "type": "complement"
      },
      {
        "slug": "stream-processing-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 195,
    "name": "Slowly Changing Dimensions (SCD)",
    "name_zh": "缓慢变化维度",
    "slug": "slowly-changing-dimensions",
    "category": "data",
    "desc": "Techniques for tracking historical changes in dimension tables (Kimball, 1996)",
    "desc_zh": "在维度表中追踪历史变更的技术体系（Kimball，1996）",
    "steps": [
      "Identify dimension attributes that change over time (e.g., customer address, product category) and classify each as Type 1, 2, or 3 based on historical tracking requirements",
      "For Type 2 SCD, add surrogate key, effective_from, effective_to, and is_current columns to the dimension table to represent each historical version as a distinct row",
      "Build ETL/ELT pipelines that detect source system changes via CDC or delta loads and apply the correct SCD strategy: overwrite for Type 1, insert new row for Type 2, add attribute column for Type 3",
      "Link fact tables to dimension surrogate keys so that historical facts always join to the dimension version that was current at the time of the event, preserving analytical accuracy",
      "Monitor SCD pipeline health by tracking dimension table growth rates, detecting unexpected attribute changes, and validating that no overlapping effective date ranges exist for Type 2 rows"
    ],
    "steps_zh": [
      "识别随时间变化的维度属性（如客户地址、产品类别），根据历史追踪需求将每个属性分类为类型1、2或3",
      "对于类型2 SCD，在维度表中添加代理键、effective_from、effective_to和is_current列，将每个历史版本表示为独立的行",
      "构建ETL/ELT管道，通过CDC或增量加载检测源系统变更，应用正确的SCD策略：类型1覆盖、类型2插入新行、类型3增加属性列",
      "将事实表关联到维度代理键，使历史事实始终关联到事件发生时有效的维度版本，保障分析准确性",
      "通过追踪维度表增长率、检测意外属性变更、验证类型2行的有效日期范围无重叠来监控SCD管道健康状况"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Type 1 Overwrite",
      "Type 2 History",
      "Type 3 Add Col",
      "Surrogate Key",
      "SCD Pipeline"
    ],
    "viz_labels_zh": [
      "类型1覆盖",
      "类型2历史",
      "类型3追加",
      "代理键",
      "SCD管道"
    ],
    "related": [
      "star-schema",
      "change-data-capture",
      "medallion-architecture"
    ],
    "tags": [
      "scd",
      "dimensional-modeling",
      "data-warehouse",
      "kimball",
      "history-tracking"
    ],
    "origin_author": "Ralph Kimball, 1996",
    "origin_source": "The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling (Wiley, 1996, 3rd ed. 2013)",
    "origin_source_zh": "《数据仓库工具包：维度建模权威指南》（Wiley，1996，第3版2013）",
    "complexity": "intermediate",
    "when_to_use": [
      "When dimension attributes change over time and business users need to analyze historical facts against the dimension values that existed at the time of each event",
      "When regulatory or audit requirements mandate that historical states of master data (customer, product, employee) are preserved and queryable",
      "When a data warehouse must support 'as-was' reporting showing what a dimension looked like at any point in the past",
      "When source systems overwrite attribute values without preserving history and the warehouse must compensate for this loss"
    ],
    "when_to_use_zh": [
      "当维度属性随时间变化，且业务用户需要按事件发生时的维度值分析历史事实时",
      "当法规或审计规定必须保留主数据（客户、产品、员工）的历史状态并使其可查询时",
      "当数据仓库必须支持展示过去任意时刻维度状态的「历史如实」报告时",
      "当源系统覆盖属性值而不保留历史，数据仓库必须弥补这一信息损失时"
    ],
    "core_concepts": [
      "Type 1 SCD: overwrite the old value with the new value; no history is preserved; used for correcting errors or attributes where history is irrelevant",
      "Type 2 SCD: insert a new row with a new surrogate key for each change; full history is preserved via effective date ranges and is_current flags",
      "Type 3 SCD: add a new column to store the previous value alongside the current value; supports limited one-step history",
      "Surrogate key: a system-generated integer or hash key assigned to each dimension row, decoupling the warehouse model from source natural keys and enabling multiple historical versions"
    ],
    "core_concepts_zh": [
      "类型1 SCD：用新值覆盖旧值；不保留历史；用于纠正错误或历史无关紧要的属性",
      "类型2 SCD：每次变更插入带新代理键的新行；通过有效日期范围和is_current标志保留完整历史",
      "类型3 SCD：增加新列在当前值旁边存储上一个值；支持有限的单步历史",
      "代理键：分配给每个维度行的系统生成整数或哈希键，将仓库模型与源自然键解耦，支持多个历史版本"
    ],
    "timeline": [
      [
        "1996",
        "Ralph Kimball introduces the SCD concept in 'The Data Warehouse Toolkit', defining Type 1, 2, and 3 strategies"
      ],
      [
        "2002",
        "Kimball and Ross publish expanded SCD guidance including hybrid types (Type 4, 6) in the 2nd edition of 'The Data Warehouse Toolkit'"
      ],
      [
        "2013",
        "The 3rd edition of 'The Data Warehouse Toolkit' formalizes the full SCD taxonomy (Types 0 through 7) and becomes the definitive SCD reference"
      ],
      [
        "2020",
        "dbt introduces incremental models and snapshot functionality, making Type 2 SCD implementable in SQL-first ELT pipelines on cloud warehouses"
      ]
    ],
    "timeline_zh": [
      [
        "1996",
        "Ralph Kimball在《数据仓库工具包》中引入SCD概念，定义类型1、2和3策略"
      ],
      [
        "2002",
        "Kimball和Ross在《数据仓库工具包》第2版中发布扩展SCD指南，包括混合类型（类型4、6）"
      ],
      [
        "2013",
        "《数据仓库工具包》第3版确立完整的SCD分类体系（类型0至7），成为SCD权威参考"
      ],
      [
        "2020",
        "dbt引入增量模型和快照功能，使类型2 SCD可在云数据仓库上的SQL优先ELT管道中实现"
      ]
    ],
    "dos": [
      "Do choose the SCD type based on actual business reporting needs rather than technical convenience because the wrong type loses history that cannot be recovered",
      "Do use surrogate keys for all Type 2 dimensions because natural keys cannot uniquely identify historical versions of the same entity",
      "Do close expired Type 2 rows atomically with the insert of the new row to prevent gaps or overlaps in effective date ranges",
      "Do document the SCD type for each dimension attribute in a data dictionary because undocumented conventions become tribal knowledge that causes reporting errors"
    ],
    "dos_zh": [
      "根据实际业务报告需求而非技术便利性选择SCD类型，因为错误的类型会丢失无法恢复的历史",
      "对所有类型2维度使用代理键，因为自然键无法唯一标识同一实体的历史版本",
      "在插入新行时原子性地关闭过期的类型2行，以防止有效日期范围出现间隙或重叠",
      "在数据字典中记录每个维度属性的SCD类型，因为未记录的约定会成为部落知识并导致报告错误"
    ],
    "donts": [
      "Don't apply Type 2 SCD to all dimension attributes indiscriminately because dimension table explosion leads to query performance degradation and storage waste",
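      "Don't repoint existing fact table foreign keys when a Type 2 dimension row is closed and a new version is inserted because historical facts must keep the surrogate key that was current at event time; only newly loaded facts should reference the new version",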
      "Don't mix SCD logic with business transformation logic in the same ETL step because it makes pipelines harder to test and debug",
      "Don't ignore late-arriving dimension changes where the source change timestamp precedes already-loaded fact records because retroactive corrections require special handling"
    ],
    "donts_zh": [
      "不要不加区分地对所有维度属性应用类型2 SCD，因为维度表膨胀会导致查询性能下降和存储浪费",
      "不要在类型2维度行关闭并插入新版本时改写已有事实表的外键，因为历史事实必须保留事件发生时有效的代理键；只有新加载的事实才应引用新版本",
      "不要在同一ETL步骤中混合SCD逻辑和业务转换逻辑，因为这会使管道更难测试和调试",
      "不要忽视迟到的维度变更（源变更时间戳早于已加载的事实记录），因为追溯更正需要特殊处理"
    ],
    "case_study_company": "Airbnb",
    "case_study": "Airbnb's data warehouse uses Type 2 SCD on its host and listing dimensions to support accurate historical analysis of booking revenue. When a host changes their listing's price tier or location, a new dimension row is inserted with a new surrogate key, preserving the state that existed when each booking was made. This allows revenue reports to correctly attribute bookings to the pricing tier that was active at the time of the reservation rather than the current tier, which is essential for accurate year-over-year comparison and cohort analysis.",
    "case_study_zh": "Airbnb的数据仓库在其房东和房源维度上使用类型2 SCD，以支持准确的历史预订收入分析。当房东更改房源的价格层级或位置时，会插入一个带新代理键的新维度行，保留每次预订时的状态。这使收入报告能够将预订正确归因于预订时有效的价格层级，而非当前层级，这对准确的年同比比较和队列分析至关重要。",
    "when_not_to_use": [
      "Operational OLTP systems where historical dimension versioning adds unnecessary complexity and query overhead",
      "When the dimension attribute changes so frequently (e.g., real-time price) that Type 2 generates millions of rows and a separate temporal table or time-series store is more appropriate",
      "When no business user has expressed a need to analyze historical facts against past dimension states and the added complexity is pure overhead",
      "Highly agile early-stage data models where dimension schemas change frequently and SCD infrastructure would need constant rework"
    ],
    "when_not_to_use_zh": [
      "历史维度版本化会增加不必要的复杂性和查询开销的操作型OLTP系统",
      "当维度属性变化过于频繁（如实时价格）导致类型2生成数百万行时，独立的时态表或时序存储更合适",
      "当没有业务用户表达需要按过去维度状态分析历史事实，增加的复杂性纯属开销时",
      "维度模式频繁变更的高度敏捷早期数据模型，SCD基础设施需要持续返工"
    ],
    "adopters": [
      "Airbnb",
      "Amazon",
      "Walmart",
      "Capital One",
      "Snowflake (dbt snapshots)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Kimball, R. & Ross, M. (2013). \"The Data Warehouse Toolkit: The Definitive Guide to Dimensional Modeling\", 3rd ed. Wiley.",
    "secondary_sources": [
      "Kimball, R. (1996). \"Slowly Changing Dimensions\". DBMS Magazine.",
      "dbt Labs (2022). \"dbt Snapshots: Implementing Type 2 SCDs in SQL\". docs.getdbt.com.",
      "Inmon, W.H. (2005). \"Building the Data Warehouse\", 4th ed. Wiley."
    ],
    "typed_relations": [
      {
        "slug": "star-schema",
        "type": "complement"
      },
      {
        "slug": "change-data-capture",
        "type": "complement"
      },
      {
        "slug": "medallion-architecture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 196,
    "name": "Data Lineage",
    "name_zh": "数据血缘",
    "slug": "data-lineage",
    "category": "data",
    "desc": "Tracking data origin, transformations, and consumption across pipelines",
    "desc_zh": "追踪数据在管道中的来源、转换过程和消费情况",
    "steps": [
      "Instrument data pipelines to capture lineage metadata: record source datasets, transformation logic, output datasets, execution timestamps, and job identifiers at each processing step",
      "Store lineage metadata in a dedicated lineage graph (directed acyclic graph) where nodes represent datasets or transformation steps and edges represent data flow relationships",
      "Integrate lineage capture with orchestration tools (Airflow, dbt, Spark) using native hooks or OpenLineage-compatible APIs to automate metadata collection without manual instrumentation",
      "Build a lineage UI or API that allows data consumers, data owners, and compliance teams to trace any dataset upstream to its origin and downstream to all its consumers",
      "Use lineage for impact analysis: when a source schema changes or a data quality issue is detected, automatically identify all downstream datasets, dashboards, and models that are affected"
    ],
    "steps_zh": [
      "对数据管道进行埋点以捕获血缘元数据：在每个处理步骤记录源数据集、转换逻辑、输出数据集、执行时间戳和作业标识符",
      "将血缘元数据存储在专用血缘图（有向无环图）中，其中节点代表数据集或转换步骤，边代表数据流关系",
      "使用原生钩子或兼容OpenLineage的API将血缘捕获与编排工具（Airflow、dbt、Spark）集成，无需手动埋点即可自动化元数据采集",
      "构建血缘UI或API，允许数据消费者、数据所有者和合规团队将任何数据集追溯到其上游来源和所有下游消费者",
      "使用血缘进行影响分析：当源模式变更或检测到数据质量问题时，自动识别所有受影响的下游数据集、仪表板和模型"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Instrument",
      "Lineage Graph",
      "Auto-Capture",
      "Lineage UI",
      "Impact Analysis"
    ],
    "viz_labels_zh": [
      "埋点",
      "血缘图",
      "自动采集",
      "血缘界面",
      "影响分析"
    ],
    "related": [
      "data-mesh",
      "data-quality-framework",
      "schema-registry-pattern",
      "medallion-architecture"
    ],
    "tags": [
      "lineage",
      "metadata",
      "governance",
      "observability",
      "openlineage"
    ],
    "origin_author": "Industry-wide practice; standardized by OpenLineage (Astronomer/Linux Foundation, 2021)",
    "origin_source": "OpenLineage specification (openlineage.io, 2021); Apache Atlas documentation; Marquez project (WeWork, 2019)",
    "origin_source_zh": "OpenLineage规范（openlineage.io，2021）；Apache Atlas文档；Marquez项目（WeWork，2019）",
    "complexity": "intermediate",
    "when_to_use": [
      "When data quality issues in downstream reports need to be traced back to their root cause in source systems or intermediate transformations",
      "When regulatory compliance (GDPR, CCPA, BCBS 239) requires demonstrating where personal or financial data originates and how it is processed",
      "When schema changes in source systems need impact analysis to identify which downstream pipelines, models, and dashboards will break",
      "When a data mesh or data lakehouse architecture spans many teams and datasets, making manual tracking of data provenance infeasible"
    ],
    "when_to_use_zh": [
      "当下游报告中的数据质量问题需要追溯到源系统或中间转换的根本原因时",
      "当监管合规（GDPR、CCPA、BCBS 239）要求证明个人或财务数据的来源及其处理方式时",
      "当源系统模式变更需要影响分析以识别哪些下游管道、模型和仪表板将受影响时",
      "当数据网格或数据湖仓架构跨越许多团队和数据集，使手动追踪数据来源变得不可行时"
    ],
    "core_concepts": [
      "Column-level lineage: tracking data flow at the column granularity so that a single field in a report can be traced through all joins, aggregations, and transformations to the source column",
      "Operational lineage: metadata about pipeline runs including execution time, row counts, and job IDs captured alongside structural lineage",
      "OpenLineage: an open standard for lineage metadata collection providing a common API that tools like Airflow, Spark, and dbt emit events to",
      "Impact analysis: traversing the lineage graph downstream from a changed dataset (the opposite direction to root-cause tracing) to predict which datasets and consumers are affected by an upstream change"
    ],
    "core_concepts_zh": [
      "列级血缘：在列粒度追踪数据流，使报告中的单个字段可以通过所有连接、聚合和转换追溯到源列",
      "操作血缘：管道运行的元数据，包括执行时间、行数和作业ID，与结构性血缘一起捕获",
      "OpenLineage：血缘元数据采集的开放标准，提供Airflow、Spark和dbt等工具发送事件的通用API",
      "影响分析：从发生变更的数据集出发沿血缘图向下游遍历（与根因追溯方向相反），预测哪些数据集和消费者受上游变更影响"
    ],
    "timeline": [
      [
        "2015",
        "Apache Atlas enters the Apache Incubator, growing out of Hortonworks' Data Governance Initiative, to provide metadata management and lineage tracking for the Hadoop ecosystem"
      ],
      [
        "2019",
        "WeWork open-sources Marquez, a lineage metadata server for Airflow pipelines that later becomes the OpenLineage reference implementation"
      ],
      [
        "2021",
        "OpenLineage is accepted into the Linux Foundation's LF AI & Data Foundation, standardizing lineage event schemas across tools"
      ],
      [
        "2023",
        "dbt, Apache Airflow 2.7, and Apache Spark natively emit OpenLineage events, making automated lineage mainstream"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Apache Atlas进入Apache孵化器，源自Hortonworks的数据治理倡议，为Hadoop生态系统提供元数据管理和血缘追踪"
      ],
      [
        "2019",
        "WeWork开源Marquez，这是面向Airflow管道的血缘元数据服务器，后来成为OpenLineage的参考实现"
      ],
      [
        "2021",
        "OpenLineage被Linux基金会旗下的LF AI & Data基金会接纳，标准化跨工具的血缘事件模式"
      ],
      [
        "2023",
        "dbt、Apache Airflow 2.7和Apache Spark原生发送OpenLineage事件，使自动化血缘成为主流"
      ]
    ],
    "dos": [
      "Do capture lineage at the column level wherever possible because table-level lineage is insufficient for regulatory compliance and root cause analysis",
      "Do use OpenLineage-compatible tooling to avoid vendor lock-in and enable lineage aggregation across heterogeneous pipeline tools",
      "Do store lineage in a graph database or dedicated lineage store rather than a relational table because traversal queries are natural graph operations",
      "Do automate lineage capture through instrumentation rather than manual documentation because manual lineage goes stale immediately after the first pipeline change"
    ],
    "dos_zh": [
      "尽可能在列级别捕获血缘，因为表级血缘不足以满足监管合规和根本原因分析",
      "使用兼容OpenLineage的工具以避免供应商锁定，并在异构管道工具中聚合血缘",
      "将血缘存储在图数据库或专用血缘存储中，而非关系表中，因为遍历查询是天然的图操作",
      "通过埋点自动化血缘捕获，而非手动文档化，因为手动血缘在第一次管道变更后立即过时"
    ],
    "donts": [
      "Don't conflate data lineage with data catalog documentation because lineage is runtime-captured provenance, not manually authored descriptions",
      "Don't capture only successful pipeline runs because failed runs and partial writes can cause data quality issues that require lineage to diagnose",
      "Don't expose raw lineage graphs to business users without a purpose-built UI because raw DAG structures are incomprehensible to non-engineers",
      "Don't rely solely on static code analysis for lineage because dynamic SQL generation and runtime table routing cannot be resolved statically"
    ],
    "donts_zh": [
      "不要将数据血缘与数据目录文档混为一谈，因为血缘是运行时捕获的来源证明，而非手动编写的描述",
      "不要只捕获成功的管道运行，因为失败的运行和部分写入可能导致需要血缘才能诊断的数据质量问题",
      "不要在没有专用UI的情况下将原始血缘图暴露给业务用户，因为原始DAG结构对非工程师来说难以理解",
      "不要仅依赖静态代码分析来获取血缘，因为动态SQL生成和运行时表路由无法静态解析"
    ],
    "case_study_company": "ING Bank",
    "case_study": "ING Bank implemented column-level data lineage across its enterprise data platform to satisfy BCBS 239 regulatory requirements for risk data aggregation and reporting. By instrumenting Spark jobs and dbt models with OpenLineage, ING can trace any figure in a regulatory capital report back to its source transaction in the core banking system, through every join and aggregation, in under five minutes. This replaced a previously manual documentation process that took compliance teams weeks per report.",
    "case_study_zh": "ING银行在其企业数据平台中实施列级数据血缘，以满足BCBS 239风险数据汇总和报告的监管要求。通过使用OpenLineage对Spark作业和dbt模型进行埋点，ING可以在五分钟内将监管资本报告中的任何数字追溯到核心银行系统中的源交易，经过每一个连接和聚合。这取代了之前需要合规团队每份报告花费数周的手动文档化流程。",
    "when_not_to_use": [
      "Small single-pipeline projects where all transformations are visible in one place and lineage tracking adds overhead without benefit",
      "Pure real-time streaming pipelines where event-level lineage generates prohibitive metadata volume and sampling-based observability is sufficient",
      "When the organization has not yet established basic data catalog and data ownership practices, as lineage without governance context provides limited value",
      "Throwaway analytics scripts and ad-hoc queries where the cost of instrumenting lineage outweighs the benefit"
    ],
    "when_not_to_use_zh": [
      "所有转换都在一处可见的小型单管道项目，血缘追踪会增加开销而没有收益",
      "纯实时流管道，事件级血缘会产生难以承受的元数据量，基于采样的可观测性已足够",
      "当组织尚未建立基本的数据目录和数据所有权实践时，没有治理背景的血缘价值有限",
      "一次性分析脚本和临时查询，埋点血缘的成本超过收益"
    ],
    "adopters": [
      "ING Bank",
      "LinkedIn (DataHub)",
      "Netflix (Metacat)",
      "Airbnb (Dataportal)",
      "Lyft (Amundsen)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "OpenLineage Project (2021). \"OpenLineage: An Open Standard for Data Lineage\". openlineage.io.",
    "secondary_sources": [
      "Eryurek, E., Gilad, U., Lakshmanan, V., Kibunguchy-Grant, A. & Ashdown, J. (2021). \"Data Governance: The Definitive Guide\". O'Reilly Media.",
      "Basel Committee on Banking Supervision (2013). \"Principles for Effective Risk Data Aggregation and Risk Reporting\" (BCBS 239). Bank for International Settlements.",
      "Ehrlinger, L. & Woss, W. (2016). \"Towards a Definition of Knowledge Graphs\". Proceedings of the SEMANTICS 2016 Poster & Demo Track."
    ],
    "typed_relations": [
      {
        "slug": "data-mesh",
        "type": "complement"
      },
      {
        "slug": "data-quality-framework",
        "type": "complement"
      },
      {
        "slug": "schema-registry-pattern",
        "type": "complement"
      },
      {
        "slug": "medallion-architecture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 197,
    "name": "Schema Registry Pattern",
    "name_zh": "模式注册表模式",
    "slug": "schema-registry-pattern",
    "category": "data",
    "desc": "Centralized schema management for data contracts (Confluent, 2015)",
    "desc_zh": "用于数据契约的集中化模式管理体系（Confluent，2015）",
    "steps": [
      "Deploy a schema registry service (Confluent Schema Registry, AWS Glue Schema Registry, or Apicurio) that acts as the centralized repository for all schema versions across your data platform",
      "Producers register schemas before writing data: the registry assigns a schema ID, validates compatibility with the existing version history, and rejects schemas that violate the configured compatibility mode",
      "Serializers embed the schema ID in the message payload (e.g., Avro magic byte + schema ID prefix) so consumers can resolve the exact schema version used to encode each message without out-of-band coordination",
      "Configure compatibility rules per subject: BACKWARD compatibility ensures new consumers can read old messages; FORWARD ensures old consumers can read new messages; FULL ensures both directions are safe",
      "Integrate schema validation into CI/CD pipelines so that schema changes are reviewed and compatibility-checked before deployment, treating schema changes as a first-class engineering artifact"
    ],
    "steps_zh": [
      "部署模式注册表服务（Confluent Schema Registry、AWS Glue Schema Registry或Apicurio），作为数据平台中所有模式版本的集中存储库",
      "生产者在写入数据前注册模式：注册表分配模式ID，验证与现有版本历史的兼容性，并拒绝违反配置兼容性模式的模式",
      "序列化器将模式ID嵌入消息负载（如Avro魔术字节+模式ID前缀），使消费者无需带外协调即可解析用于编码每条消息的确切模式版本",
      "按主题配置兼容性规则：BACKWARD兼容性确保新消费者可读取旧消息；FORWARD确保旧消费者可读取新消息；FULL确保双向安全",
      "将模式验证集成到CI/CD管道中，使模式变更在部署前经过审查和兼容性检查，将模式变更视为一等工程制品"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Producer",
      "Schema ID",
      "Compatibility",
      "Consumer",
      "CI Validation"
    ],
    "viz_labels_zh": [
      "生产者",
      "模式ID",
      "兼容性",
      "消费者",
      "CI验证"
    ],
    "related": [
      "change-data-capture",
      "data-lineage",
      "data-quality-framework",
      "stream-processing-patterns"
    ],
    "tags": [
      "schema-registry",
      "data-contracts",
      "avro",
      "protobuf",
      "compatibility"
    ],
    "origin_author": "Confluent (Jay Kreps et al.), 2015",
    "origin_source": "Confluent Schema Registry documentation (confluent.io, 2015); Designing Event-Driven Systems (O'Reilly, 2018) by Ben Stopford",
    "origin_source_zh": "Confluent模式注册表文档（confluent.io，2015）；Ben Stopford《事件驱动系统设计》（O'Reilly，2018）",
    "complexity": "intermediate",
    "when_to_use": [
      "When multiple producers and consumers share Kafka topics or event streams and schema evolution must not break existing consumers",
      "When data contracts between microservices need to be enforced automatically rather than relying on developer discipline",
      "When serialization formats like Avro or Protobuf are used and schema IDs need to be resolved at deserialization time",
      "When compliance or data governance requires an auditable history of all schema versions ever published to production"
    ],
    "when_to_use_zh": [
      "当多个生产者和消费者共享Kafka主题或事件流，模式演进不得破坏现有消费者时",
      "当微服务之间的数据契约需要自动执行而非依赖开发者纪律时",
      "当使用Avro或Protobuf等序列化格式且需要在反序列化时解析模式ID时",
      "当合规或数据治理要求对所有曾发布到生产的模式版本进行可审计的历史记录时"
    ],
    "core_concepts": [
      "Schema subject: a named scope (typically a Kafka topic name) under which multiple schema versions are registered and their compatibility enforced",
      "Compatibility modes: BACKWARD (new schema can read old data), FORWARD (old schema can read new data), FULL (both), NONE (no compatibility checking)",
      "Schema ID wire format: the compact binary prefix (magic byte + 4-byte schema ID) that Confluent serializers embed in messages, allowing consumers to fetch the schema from the registry at deserialization",
      "Data contract: a formal agreement between producer and consumer teams about the shape, semantics, and evolution rules of a dataset, enforced by the registry"
    ],
    "core_concepts_zh": [
      "模式主题：命名范围（通常是Kafka主题名称），在其下注册多个模式版本并执行兼容性",
      "兼容性模式：BACKWARD（新模式可读取旧数据）、FORWARD（旧模式可读取新数据）、FULL（两者）、NONE（不检查兼容性）",
      "模式ID线格式：Confluent序列化器嵌入消息中的紧凑二进制前缀（魔术字节+4字节模式ID），允许消费者在反序列化时从注册表获取模式",
      "数据契约：生产者和消费者团队之间关于数据集形状、语义和演进规则的正式协议，由注册表执行"
    ],
    "timeline": [
      [
        "2015",
        "Confluent open-sources the Schema Registry as part of the Confluent Platform, paired with Apache Kafka"
      ],
      [
        "2018",
        "Ben Stopford's 'Designing Event-Driven Systems' popularizes schema registry as a core data contract pattern"
      ],
      [
        "2021",
        "AWS launches Glue Schema Registry; Azure Event Hubs adds Schema Registry support, making the pattern cloud-native"
      ],
      [
        "2023",
        "Data contract movement formalizes schema registries as the enforcement mechanism for producer-consumer SLAs in data mesh architectures"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Confluent将模式注册表作为Confluent平台的一部分开源，与Apache Kafka配套"
      ],
      [
        "2018",
        "Ben Stopford的《事件驱动系统设计》将模式注册表推广为核心数据契约模式"
      ],
      [
        "2021",
        "AWS推出Glue Schema Registry；Azure Event Hubs添加模式注册表支持，使该模式成为云原生"
      ],
      [
        "2023",
        "数据契约运动将模式注册表正式化为数据网格架构中生产者-消费者SLA的执行机制"
      ]
    ],
    "dos": [
      "Do set BACKWARD or FULL compatibility as the default because it prevents producers from silently breaking consumers without coordination",
      "Do treat schema changes as part of the pull request and code review process because catching incompatibilities before deployment is far cheaper than rolling back production",
      "Do document the semantic meaning of each field in the schema alongside its type because a field named 'amount' without currency and unit context is an incomplete contract",
      "Do version the schema subject per Kafka topic and environment (dev, staging, prod) to prevent development schema pollution from breaking production consumers"
    ],
    "dos_zh": [
      "将BACKWARD或FULL兼容性设为默认值，因为它防止生产者在未协调的情况下静默地破坏消费者",
      "将模式变更作为拉取请求和代码审查过程的一部分，因为在部署前发现不兼容远比回滚生产便宜",
      "在模式中记录每个字段的语义含义以及其类型，因为没有货币和单位上下文的「amount」字段是不完整的契约",
      "按Kafka主题和环境（开发、预发、生产）对模式主题进行版本控制，防止开发模式污染破坏生产消费者"
    ],
    "donts": [
      "Don't use JSON Schema without a registry because without versioning and compatibility checks any producer can silently break all consumers",
      "Don't delete or modify registered schema versions because consumers may still rely on those versions to deserialize historical messages",
      "Don't let producers register schemas at runtime in production without CI/CD validation because unreviewed schema changes are a common source of production incidents",
      "Don't ignore the schema registry as an operational dependency because if it becomes unavailable all producers and consumers using it will fail to start or process messages"
    ],
    "donts_zh": [
      "不要在没有注册表的情况下使用JSON Schema，因为没有版本控制和兼容性检查，任何生产者都可能静默地破坏所有消费者",
      "不要删除或修改已注册的模式版本，因为消费者可能仍依赖这些版本来反序列化历史消息",
      "不要在生产中不经CI/CD验证就让生产者在运行时注册模式，因为未经审查的模式变更是生产事故的常见来源",
      "不要忽视模式注册表作为操作依赖，因为如果它不可用，所有使用它的生产者和消费者将无法启动或处理消息"
    ],
    "case_study_company": "LinkedIn",
    "case_study": "LinkedIn uses a schema registry (built on their internal Espresso key-value store) to manage thousands of Avro schemas across over 10,000 Kafka topics that power their feed, notifications, and analytics pipelines. The registry enforces BACKWARD compatibility so that the 300+ microservices consuming any given topic can be deployed independently without coordination windows. Schema evolution incidents dropped by over 80% after the registry was mandated company-wide, compared to the prior JSON-without-schema era.",
    "case_study_zh": "LinkedIn使用模式注册表（基于其内部Espresso键值存储构建）管理驱动其动态、通知和分析管道的10,000多个Kafka主题中的数千个Avro模式。注册表执行BACKWARD兼容性，使消费任何给定主题的300多个微服务可以独立部署，无需协调窗口。与之前无模式JSON时代相比，模式演进事故在注册表全公司强制推行后下降了80%以上。",
    "when_not_to_use": [
      "Simple request-response REST APIs where OpenAPI specification and HTTP versioning provide sufficient contract management",
      "Single-team internal pipelines where producer and consumer are the same team and informal coordination is sufficient",
      "When message volumes are low and human-readable JSON without schema enforcement meets the team's agility needs",
      "Throwaway event streams used only for debugging or short-lived experiments where schema evolution is not a concern"
    ],
    "when_not_to_use_zh": [
      "OpenAPI规范和HTTP版本控制提供足够契约管理的简单请求-响应REST API",
      "生产者和消费者是同一团队且非正式协调已足够的单团队内部管道",
      "当消息量较低且没有模式强制的人类可读JSON满足团队敏捷性需求时",
      "仅用于调试或短期实验的一次性事件流，模式演进不是关注点"
    ],
    "adopters": [
      "LinkedIn",
      "Uber",
      "Cloudflare",
      "Robinhood",
      "Confluent customers (10,000+)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Confluent Inc. (2015). \"Schema Registry Documentation\". docs.confluent.io.",
    "secondary_sources": [
      "Stopford, B. (2018). \"Designing Event-Driven Systems\". O'Reilly Media.",
      "Narkhede, N., Shapira, G. & Palino, T. (2017). \"Kafka: The Definitive Guide\". O'Reilly Media.",
      "Dehghani, Z. (2022). \"Data Contracts\". martinfowler.com."
    ],
    "typed_relations": [
      {
        "slug": "change-data-capture",
        "type": "complement"
      },
      {
        "slug": "data-lineage",
        "type": "complement"
      },
      {
        "slug": "data-quality-framework",
        "type": "complement"
      },
      {
        "slug": "stream-processing-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 198,
    "name": "Data Quality Framework",
    "name_zh": "数据质量框架",
    "slug": "data-quality-framework",
    "category": "data",
    "desc": "Systematic validation of data accuracy, completeness, and consistency",
    "desc_zh": "系统性验证数据准确性、完整性和一致性的方法体系",
    "steps": [
      "Define data quality dimensions relevant to the business context: accuracy, completeness, consistency, timeliness, uniqueness, and validity; assign ownership and SLOs for each critical dataset",
      "Instrument data pipelines with quality checks at ingestion, transformation, and serving layers using declarative frameworks (dbt tests, Great Expectations, Soda) that express expectations as code",
      "Publish quality metrics to a data observability platform with alerting so that data owners are notified of SLO breaches before downstream consumers are impacted",
      "Implement quarantine patterns: route records that fail quality checks to a quarantine table with failure annotations rather than silently dropping them, preserving auditability",
      "Close the feedback loop by routing quality alerts to data producers and establishing data quality SLAs in data contracts, making quality a shared responsibility between producers and consumers"
    ],
    "steps_zh": [
      "定义与业务背景相关的数据质量维度：准确性、完整性、一致性、及时性、唯一性和有效性；为每个关键数据集分配所有权和SLO",
      "使用声明式框架（dbt tests、Great Expectations、Soda）在摄入、转换和服务层对数据管道进行质量检查埋点，将期望表达为代码",
      "将质量指标发布到数据可观测性平台并设置告警，使数据所有者在下游消费者受影响前收到SLO违规通知",
      "实施隔离模式：将未通过质量检查的记录路由到带有失败注释的隔离表，而非静默丢弃，保留可审计性",
      "通过将质量告警路由给数据生产者并在数据契约中建立数据质量SLA来形成反馈闭环，使质量成为生产者和消费者的共同责任"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Accuracy",
      "Completeness",
      "Consistency",
      "Timeliness",
      "Validity"
    ],
    "viz_labels_zh": [
      "准确性",
      "完整性",
      "一致性",
      "时效性",
      "有效性"
    ],
    "related": [
      "data-lineage",
      "schema-registry-pattern",
      "data-mesh",
      "medallion-architecture"
    ],
    "tags": [
      "data-quality",
      "observability",
      "validation",
      "great-expectations",
      "dbt-tests"
    ],
    "origin_author": "Thomas C. Redman (1992); operationalized by DAMA International DMBOK",
    "origin_source": "DAMA International, \"DMBOK2: Data Management Body of Knowledge\" (2017); Great Expectations documentation (2019)",
    "origin_source_zh": "DAMA International《DMBOK2：数据管理知识体系》（2017）；Great Expectations文档（2019）",
    "complexity": "intermediate",
    "when_to_use": [
      "When downstream business decisions, ML models, or regulatory reports depend on data that could be corrupted by upstream system defects",
      "When data pipelines span multiple systems and teams, creating opportunities for data corruption at each handoff",
      "When SLA breaches or model accuracy degradation are caused by silent data quality issues that are detected only after business impact",
      "When data mesh or data product patterns require producers to publish quality guarantees to consumers as part of their data product contract"
    ],
    "when_to_use_zh": [
      "当下游业务决策、ML模型或监管报告依赖可能被上游系统缺陷破坏的数据时",
      "当数据管道跨越多个系统和团队，在每次交接处都有数据损坏的机会时",
      "当SLA违规或模型精度下降是由仅在业务影响后才发现的静默数据质量问题引起时",
      "当数据网格或数据产品模式要求生产者作为其数据产品契约的一部分向消费者发布质量保证时"
    ],
    "core_concepts": [
      "Data quality dimensions: the measurable characteristics of data — accuracy (correct values), completeness (no missing values), consistency (no contradictions), timeliness (freshness), uniqueness (no duplicates), validity (conforms to format/range rules)",
      "Expectations as code: expressing data quality rules as version-controlled, executable assertions that run in CI/CD and pipeline jobs rather than ad-hoc manual checks",
      "Data observability: continuous monitoring of data health metrics (freshness, volume, distribution, schema) with anomaly detection to catch quality issues before they propagate",
      "Shift-left quality: validating data quality at the earliest possible pipeline stage (ingestion) rather than at the consumption point to minimize the blast radius of quality failures"
    ],
    "core_concepts_zh": [
      "数据质量维度：数据的可测量特征——准确性（正确的值）、完整性（无缺失值）、一致性（无矛盾）、及时性（新鲜度）、唯一性（无重复）、有效性（符合格式/范围规则）",
      "代码化期望：将数据质量规则表达为版本控制的、可执行的断言，在CI/CD和管道作业中运行，而非临时手动检查",
      "数据可观测性：持续监控数据健康指标（新鲜度、数量、分布、模式），通过异常检测在质量问题传播前发现",
      "左移质量：在最早的管道阶段（摄入）而非消费点验证数据质量，以最小化质量失败的爆炸半径"
    ],
    "timeline": [
      [
        "1992",
        "Thomas Redman publishes 'Data Quality: Management and Technology', establishing data quality as a discipline"
      ],
      [
        "2017",
        "DAMA International releases DMBOK2, providing a comprehensive data management framework including quality dimensions"
      ],
      [
        "2019",
        "Great Expectations open-sourced, introducing 'expectations as code' and making programmatic data quality accessible to data engineers"
      ],
      [
        "2022",
        "Monte Carlo, Bigeye, and Soda raise significant funding, validating data observability as a distinct commercial category"
      ]
    ],
    "timeline_zh": [
      [
        "1992",
        "Thomas Redman出版《数据质量：管理与技术》，将数据质量确立为一门学科"
      ],
      [
        "2017",
        "DAMA International发布DMBOK2，提供包含质量维度的综合数据管理框架"
      ],
      [
        "2019",
        "Great Expectations开源，引入「代码化期望」，使数据工程师可以进行程序化数据质量管理"
      ],
      [
        "2022",
        "Monte Carlo、Bigeye和Soda获得大量融资，验证数据可观测性作为独立商业类别的价值"
      ]
    ],
    "dos": [
      "Do define quality rules in collaboration with data consumers because they know which quality failures have business impact and which are acceptable anomalies",
      "Do run quality checks in the pipeline, not just on a schedule, because schedule-based checks detect failures hours after the bad data has already propagated",
      "Do monitor data distributions and volumes in addition to schema checks because statistical anomalies (e.g., sudden 50% drop in row count) often signal upstream issues before column-level checks fire",
      "Do track quality metrics over time and build quality dashboards because point-in-time pass/fail is insufficient for understanding quality trends and SLO achievement"
    ],
    "dos_zh": [
      "与数据消费者协作定义质量规则，因为他们知道哪些质量失败有业务影响，哪些是可接受的异常",
      "在管道中而非仅按计划运行质量检查，因为基于计划的检查在坏数据已传播数小时后才发现失败",
      "除模式检查外还监控数据分布和数量，因为统计异常（如行数突然下降50%）通常在列级检查触发前就标志着上游问题",
      "随时间追踪质量指标并建立质量仪表板，因为时间点的通过/失败不足以理解质量趋势和SLO达成情况"
    ],
    "donts": [
      "Don't treat data quality as a one-time remediation project because data quality degrades continuously and requires ongoing monitoring",
      "Don't block all pipeline progress on quality failures because blocking every issue stops data delivery entirely; use severity tiers to quarantine critical failures and warn on minor ones",
      "Don't write quality checks only for known failure modes because unexpected quality issues are the most damaging; include statistical distribution monitoring for unknown unknowns",
      "Don't assign data quality responsibility solely to a central data team because data producers closest to the source are best positioned to understand and fix quality issues"
    ],
    "donts_zh": [
      "不要将数据质量视为一次性整治项目，因为数据质量会持续退化，需要持续监控",
      "不要在质量失败时阻止所有管道进度，因为阻止每个问题会完全停止数据交付；使用严重性级别隔离关键失败并对次要失败发出警告",
      "不要只为已知失败模式编写质量检查，因为意外的质量问题最具破坏性；包括统计分布监控以应对未知的未知",
      "不要将数据质量责任完全分配给中心化数据团队，因为最接近源头的数据生产者最有能力理解和修复质量问题"
    ],
    "case_study_company": "Intuit",
    "case_study": "Intuit built a company-wide data quality platform called DataQA that runs over 500,000 quality checks daily across their tax, payments, and small business data products. Using Great Expectations integrated into their Airflow pipelines, Intuit detects data quality issues before they reach the models powering TurboTax recommendations and QuickBooks insights. After implementing the framework, Intuit reduced data-quality-related model prediction errors by 35% and cut the mean time to detect a data quality incident from 4 hours to under 15 minutes.",
    "case_study_zh": "Intuit构建了名为DataQA的全公司数据质量平台，每天在其税务、支付和小企业数据产品中运行超过50万次质量检查。使用集成到Airflow管道中的Great Expectations，Intuit在数据质量问题到达驱动TurboTax推荐和QuickBooks洞察的模型之前检测到它们。实施该框架后，Intuit将数据质量相关的模型预测错误减少了35%，并将检测数据质量事件的平均时间从4小时缩短到15分钟以内。",
    "when_not_to_use": [
      "Exploratory data science environments where data is inherently messy and the cost of quality enforcement exceeds the benefit",
      "Real-time streaming pipelines where synchronous quality checks add latency that violates SLA requirements; use async anomaly detection instead",
      "When the organization has not yet established data ownership — quality frameworks without accountable owners produce alerts that no one acts on",
      "Throwaway prototypes and one-time data migrations where ongoing quality monitoring is unnecessary"
    ],
    "when_not_to_use_zh": [
      "数据本质上杂乱且质量执行成本超过收益的探索性数据科学环境",
      "同步质量检查增加违反SLA要求延迟的实时流管道；改用异步异常检测",
      "当组织尚未建立数据所有权时——没有负责任所有者的质量框架会产生无人处理的告警",
      "一次性原型和单次数据迁移，持续质量监控是不必要的"
    ],
    "adopters": [
      "Intuit",
      "Airbnb",
      "Lyft",
      "Shopify",
      "Etsy"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "DAMA International (2017). \"DMBOK2: Data Management Body of Knowledge\", 2nd ed. Technics Publications.",
    "secondary_sources": [
      "Redman, T.C. (1992). \"Data Quality: Management and Technology\". Bantam Books.",
      "Great Expectations (2019). \"Great Expectations Documentation\". greatexpectations.io.",
      "Barr Moses et al. (2022). \"Data Quality Fundamentals\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "data-lineage",
        "type": "complement"
      },
      {
        "slug": "schema-registry-pattern",
        "type": "complement"
      },
      {
        "slug": "data-mesh",
        "type": "complement"
      },
      {
        "slug": "medallion-architecture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 199,
    "name": "Medallion Architecture",
    "name_zh": "奖章架构",
    "slug": "medallion-architecture",
    "category": "data",
    "desc": "Bronze/Silver/Gold layered data processing pattern (Databricks, 2021)",
    "desc_zh": "铜/银/金分层数据处理模式（Databricks，2021）",
    "steps": [
      "Ingest raw data from source systems into the Bronze layer without transformation, preserving the exact source payload including schema, data types, and any malformed records for full auditability",
      "Transform Bronze data into the Silver layer by applying cleaning, deduplication, schema normalization, and type casting; join related datasets and enrich records to produce a validated, integrated view",
      "Aggregate and model Silver data into the Gold layer, producing business-level tables, star schemas, or aggregated metrics optimized for BI tools, dashboards, and ML feature consumption",
      "Enforce data quality checks at each layer boundary: Bronze validates that data arrived; Silver validates schema and business rules; Gold validates business metrics and aggregation accuracy",
      "Implement incremental processing using Delta Lake or Iceberg MERGE operations so that each layer is updated efficiently without full re-scans, enabling near-real-time Gold layer freshness"
    ],
    "steps_zh": [
      "将来自源系统的原始数据无转换地摄入铜层，保留完整的源负载，包括模式、数据类型和任何格式错误的记录，以实现完整可审计性",
      "通过应用清洗、去重、模式规范化和类型转换将铜层数据转换为银层；连接相关数据集并丰富记录以产生经过验证的集成视图",
      "将银层数据聚合和建模为金层，产生为BI工具、仪表板和ML特征消费优化的业务级表、星型模式或聚合指标",
      "在每个层边界执行数据质量检查：铜层验证数据已到达；银层验证模式和业务规则；金层验证业务指标和聚合准确性",
      "使用Delta Lake或Iceberg MERGE操作实现增量处理，使每层能够高效更新而无需全量重扫，实现近实时金层新鲜度"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Bronze Layer",
      "Silver Layer",
      "Gold Layer",
      "Quality Gates",
      "Incremental"
    ],
    "viz_labels_zh": [
      "铜层",
      "银层",
      "金层",
      "质量门",
      "增量处理"
    ],
    "related": [
      "data-lakehouse",
      "data-quality-framework",
      "data-lineage",
      "slowly-changing-dimensions"
    ],
    "tags": [
      "medallion",
      "lakehouse",
      "delta-lake",
      "data-layers",
      "databricks"
    ],
    "origin_author": "Databricks (Reza Shiftehfar et al.), 2021",
    "origin_source": "Databricks Engineering Blog: 'Building a Medallion Architecture with Delta Lake' (2021); Delta Lake documentation",
    "origin_source_zh": "Databricks工程博客：「使用Delta Lake构建奖章架构」（2021）；Delta Lake文档",
    "complexity": "intermediate",
    "when_to_use": [
      "When building a data lakehouse on Delta Lake, Apache Iceberg, or Apache Hudi that needs clear separation between raw ingestion, integration, and serving layers",
      "When multiple downstream consumers (BI, ML, data science) have different quality and latency requirements and a single pipeline cannot serve all of them optimally",
      "When auditability requires preserving the original raw data alongside transformed versions for regulatory or debugging purposes",
      "When incremental data processing and data quality need to be enforced systematically across a multi-team data platform"
    ],
    "when_to_use_zh": [
      "当在Delta Lake、Apache Iceberg或Apache Hudi上构建需要清晰分离原始摄入、集成和服务层的数据湖仓时",
      "当多个下游消费者（BI、ML、数据科学）有不同的质量和延迟要求，单一管道无法最优服务所有人时",
      "当可审计性要求为监管或调试目的在转换版本旁边保留原始数据时",
      "当增量数据处理和数据质量需要在多团队数据平台上系统性执行时"
    ],
    "core_concepts": [
      "Bronze layer: the raw landing zone that stores data exactly as received from source systems; immutable append-only storage enabling full replay and re-processing",
      "Silver layer: the cleaned and integrated layer where business rules are applied, duplicates removed, and schemas normalized; serves as the single source of truth for analytical consumption",
      "Gold layer: the consumption-ready layer of aggregated, business-domain-specific datasets; optimized for query performance by BI tools and ML feature pipelines",
      "Delta Lake / table format: ACID-compliant table format (Delta, Iceberg, Hudi) that enables MERGE, schema evolution, and time travel across all three layers"
    ],
    "core_concepts_zh": [
      "铜层：原始登陆区，完全按从源系统接收的方式存储数据；不可变的追加式存储支持完整重放和重新处理",
      "银层：应用业务规则、删除重复项和规范化模式的清洗集成层；作为分析消费的单一真实数据源",
      "金层：消费就绪的聚合业务域特定数据集层；针对BI工具和ML特征管道的查询性能进行优化",
      "Delta Lake/表格式：支持所有三层MERGE、模式演进和时间旅行的符合ACID的表格式（Delta、Iceberg、Hudi）"
    ],
    "timeline": [
      [
        "2019",
        "Databricks releases Delta Lake as open source, providing the ACID foundation that makes the medallion pattern practical at scale"
      ],
      [
        "2021",
        "Databricks formalizes the 'Medallion Architecture' pattern in engineering blog posts and Spark Summit talks"
      ],
      [
        "2022",
        "Microsoft adopts medallion as the recommended architecture for Azure Data Lake and Synapse Analytics, accelerating enterprise adoption"
      ],
      [
        "2024",
        "Apache Iceberg achieves feature parity with Delta Lake; medallion architecture becomes table-format-agnostic, running on any open lakehouse format"
      ]
    ],
    "timeline_zh": [
      [
        "2019",
        "Databricks将Delta Lake作为开源发布，提供使奖章模式在规模上实用的ACID基础"
      ],
      [
        "2021",
        "Databricks在工程博客文章和Spark峰会演讲中正式化「奖章架构」模式"
      ],
      [
        "2022",
        "微软采用奖章架构作为Azure Data Lake和Synapse Analytics的推荐架构，加速企业采纳"
      ],
      [
        "2024",
        "Apache Iceberg实现与Delta Lake的功能对等；奖章架构变得与表格式无关，可在任何开放湖仓格式上运行"
      ]
    ],
    "dos": [
      "Do keep the Bronze layer truly raw and append-only because any transformation applied at ingestion time makes it impossible to re-derive the Silver layer from a different transformation logic later",
      "Do enforce explicit schema on Silver and Gold tables because schema-on-read in Silver causes unpredictable failures in downstream Gold pipelines",
      "Do partition Gold tables by the query predicates most common in BI tools (e.g., date, region) because partition pruning is the primary performance lever for large Gold tables",
      "Do use incremental MERGE rather than full refresh for Silver and Gold layers because full refreshes are slow, expensive, and create latency spikes for consumers"
    ],
    "dos_zh": [
      "保持铜层真正原始且只追加，因为在摄入时应用的任何转换都会使以后无法从不同的转换逻辑重新派生银层",
      "对银层和金层表强制执行显式模式，因为银层的读取时模式会在下游金层管道中造成不可预测的失败",
      "按BI工具中最常见的查询谓词（如日期、区域）对金层表进行分区，因为分区裁剪是大型金层表的主要性能杠杆",
      "对银层和金层使用增量MERGE而非全量刷新，因为全量刷新速度慢、成本高，会为消费者造成延迟尖峰"
    ],
    "donts": [
      "Don't skip the Bronze layer and write transformed data directly to Silver because without the raw layer you lose the ability to replay history after discovering a transformation bug",
      "Don't build Gold tables that replicate the entire Silver layer with minor variations because this creates maintenance sprawl; Gold should aggregate and simplify",
      "Don't mix streaming and batch processing in the same layer without explicit partition isolation because mixed write patterns cause Delta Lake file compaction issues",
      "Don't treat the medallion architecture as strictly three layers; hybrid Silver-Gold or additional domain layers are valid when business complexity justifies them"
    ],
    "donts_zh": [
      "不要跳过铜层直接将转换后的数据写入银层，因为没有原始层，在发现转换错误后就无法重放历史",
      "不要构建以细微变化复制整个银层的金层表，因为这会产生维护蔓延；金层应聚合和简化",
      "不要在没有显式分区隔离的情况下在同一层混合流处理和批处理，因为混合写入模式会导致Delta Lake文件压缩问题",
      "不要将奖章架构视为严格的三层；当业务复杂性证明合理时，混合银-金层或额外的领域层是有效的"
    ],
    "case_study_company": "Comcast",
    "case_study": "Comcast migrated its advertising analytics platform to a medallion architecture on Delta Lake, processing over 50 billion ad impression events per day. The Bronze layer captures raw impression, click, and conversion events from 200+ ad servers. The Silver layer joins these events with campaign metadata and applies identity resolution to deduplicate device IDs. The Gold layer produces advertiser-facing campaign performance reports and ML training datasets for bid optimization models. This architecture reduced ad reporting latency from 24 hours to under 2 hours while cutting data platform costs by 40% through Delta Lake's optimized file compaction.",
    "case_study_zh": "Comcast将其广告分析平台迁移到Delta Lake上的奖章架构，每天处理超过500亿个广告展示事件。铜层从200多个广告服务器捕获原始展示、点击和转化事件。银层将这些事件与活动元数据连接，并应用身份解析去重设备ID。金层产生面向广告主的活动绩效报告和用于出价优化模型的ML训练数据集。这种架构将广告报告延迟从24小时减少到2小时以内，同时通过Delta Lake的优化文件压缩将数据平台成本降低了40%。",
    "when_not_to_use": [
      "Simple single-source ETL pipelines where a two-layer approach (raw + curated) is sufficient and the three-layer overhead is not justified",
      "Real-time OLTP systems where sub-second query latency is required and the batch-oriented medallion processing model is incompatible with latency SLAs",
      "When the organization lacks the engineering maturity to maintain three layers of pipelines; start with a two-layer approach and evolve",
      "Small datasets that fit in a single database where the complexity of a distributed lakehouse is unnecessary overhead"
    ],
    "when_not_to_use_zh": [
      "两层方法（原始+精制）已足够且三层开销不合理的简单单源ETL管道",
      "需要亚秒查询延迟且面向批处理的奖章处理模型与延迟SLA不兼容的实时OLTP系统",
      "当组织缺乏维护三层管道的工程成熟度时；从两层方法开始并逐步演进",
      "适合单个数据库的小型数据集，分布式湖仓的复杂性是不必要的开销"
    ],
    "adopters": [
      "Databricks",
      "Comcast",
      "Shell",
      "H&M Group",
      "HSBC"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Databricks Engineering (2021). \"Building a Medallion Architecture with Delta Lake\". databricks.com/blog.",
    "secondary_sources": [
      "Armbrust, M. et al. (2020). \"Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores\". Proceedings of the VLDB Endowment, 13(12).",
      "Doan, A. et al. (2022). \"The Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics\". CIDR 2022.",
      "Databricks (2023). \"Delta Lake: The Definitive Guide\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "data-lakehouse",
        "type": "extends"
      },
      {
        "slug": "data-quality-framework",
        "type": "complement"
      },
      {
        "slug": "data-lineage",
        "type": "complement"
      },
      {
        "slug": "slowly-changing-dimensions",
        "type": "complement"
      }
    ]
  },
  {
    "id": 306,
    "name": "Data Catalog",
    "name_zh": "数据目录",
    "slug": "data-catalog",
    "category": "data",
    "desc": "Centralized metadata management system that enables data discovery, governance, and self-service analytics across an organization.",
    "desc_zh": "集中式元数据管理系统，支持组织内的数据发现、治理和自助分析。",
    "steps": [
      "Ingest metadata from all data sources (databases, data lakes, BI tools, pipelines) through automated crawlers and manual annotations",
      "Build a unified metadata store with technical metadata (schema, partitions, statistics), business metadata (owners, definitions, tags), and operational metadata (lineage, usage metrics)",
      "Enable semantic search and browse so data consumers can discover datasets by keyword, domain, owner, or data classification",
      "Implement governance workflows: ownership assignment, access requests, sensitivity tagging (PII, GDPR), and data quality certification",
      "Continuously update metadata via change-event subscriptions so the catalog reflects the real-time state of the data landscape"
    ],
    "steps_zh": [
      "通过自动爬虫和人工标注从所有数据源（数据库、数据湖、BI工具、管道）摄取元数据",
      "构建统一元数据存储，包含技术元数据（模式、分区、统计）、业务元数据（所有者、定义、标签）和运营元数据（血缘、使用指标）",
      "启用语义搜索和浏览，使数据消费者可以按关键词、领域、所有者或数据分类发现数据集",
      "实施治理工作流：所有权分配、访问申请、敏感度标记（个人信息、GDPR）和数据质量认证",
      "通过变更事件订阅持续更新元数据，使目录反映数据全景的实时状态"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Crawler",
      "Metadata Store",
      "Discovery",
      "Governance",
      "Change Sync"
    ],
    "viz_labels_zh": [
      "元数据爬取",
      "元数据存储",
      "数据发现",
      "数据治理",
      "变更同步"
    ],
    "related": [
      "data-lineage",
      "data-mesh",
      "data-quality-framework",
      "data-lineage-governance"
    ],
    "tags": [
      "metadata",
      "data-discovery",
      "governance",
      "data-catalog",
      "alation"
    ],
    "origin_author": "Alation",
    "origin_source": "Alation (2012). 'The Collaborative Data Catalog'. alation.com; O'Neil, C. & Schutt, R. (2013). Doing Data Science. O'Reilly.",
    "origin_source_zh": "Alation（2012）。「协作式数据目录」。alation.com；O'Neil, C. & Schutt, R.（2013）。《数据科学实战》。O'Reilly。",
    "complexity": "intermediate",
    "when_to_use": [
      "When an organization has dozens or hundreds of data sources and analysts spend significant time locating trustworthy datasets rather than analyzing them",
      "When data governance obligations (GDPR, CCPA, HIPAA) require documented data inventories, sensitivity classifications, and lineage trails",
      "When implementing a data mesh or data platform where domain teams need to publish, discover, and subscribe to each other's data products",
      "When onboarding new data engineers or analysts who need to understand what data exists, who owns it, and how it has been used"
    ],
    "when_to_use_zh": [
      "当组织拥有数十或数百个数据源，分析师花费大量时间定位可信数据集而非分析数据时",
      "当数据治理义务（GDPR、CCPA、HIPAA）要求有文档化的数据清单、敏感度分类和血缘记录时",
      "当实施数据网格或数据平台，领域团队需要发布、发现和订阅彼此的数据产品时",
      "当入职新数据工程师或分析师，需要了解存在哪些数据、谁拥有它以及如何使用时"
    ],
    "core_concepts": [
      "Technical metadata: schema definitions, column types, row counts, partition keys, and storage statistics automatically harvested from source systems",
      "Business metadata: human-authored glossary terms, dataset descriptions, ownership assignments, and domain classifications that give technical assets business context",
      "Data lineage integration: linking catalog entries to upstream sources and downstream consumers so impact analysis and root-cause investigation are possible",
      "Collaborative curation: crowdsourced ratings, usage-based popularity signals, and expert endorsements that surface high-quality datasets to new consumers"
    ],
    "core_concepts_zh": [
      "技术元数据：从源系统自动采集的模式定义、列类型、行数、分区键和存储统计",
      "业务元数据：人工编写的术语词汇表、数据集描述、所有权分配和领域分类，为技术资产提供业务背景",
      "数据血缘集成：将目录条目与上游源和下游消费者关联，支持影响分析和根本原因调查",
      "协作管理：众包评分、基于使用情况的热度信号和专家推荐，向新消费者展示高质量数据集"
    ],
    "timeline": [
      [
        "2012",
        "Alation founded and pioneers the modern collaborative data catalog concept, combining automated metadata harvesting with human curation"
      ],
      [
        "2015",
        "Waterline Data and Collibra enter the market, establishing data catalog as a recognized product category distinct from traditional metadata repositories"
      ],
      [
        "2019",
        "Gartner identifies data catalog as a critical capability in the modern data stack; AWS Glue Data Catalog and Azure Purview accelerate cloud adoption"
      ],
      [
        "2022",
        "Open Metadata and DataHub (LinkedIn) emerge as open-source alternatives; data catalog becomes standard infrastructure in data mesh architectures"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Alation成立，开创现代协作式数据目录概念，将自动元数据采集与人工管理相结合"
      ],
      [
        "2015",
        "Waterline Data和Collibra进入市场，将数据目录确立为有别于传统元数据仓库的公认产品类别"
      ],
      [
        "2019",
        "Gartner将数据目录确定为现代数据栈的关键能力；AWS Glue数据目录和Azure Purview加速云端采用"
      ],
      [
        "2022",
        "Open Metadata和DataHub（LinkedIn）作为开源替代方案出现；数据目录成为数据网格架构中的标准基础设施"
      ]
    ],
    "dos": [
      "Do automate metadata ingestion through crawlers and API integrations because manual-only catalogs quickly become stale and lose user trust",
      "Do assign clear dataset owners and make ownership visible in the catalog because undiscoverable ownership is the primary reason data governance programs fail",
      "Do integrate usage analytics (query frequency, most-accessed columns, last queried) so the catalog surfaces popular and recently validated datasets prominently",
      "Do tag datasets with sensitivity classifications at ingestion time because retrofitting PII tags across thousands of existing assets is prohibitively expensive"
    ],
    "dos_zh": [
      "通过爬虫和API集成自动化元数据摄取，因为仅靠人工维护的目录会迅速过时并失去用户信任",
      "分配清晰的数据集所有者并在目录中显示所有权，因为不可发现的所有权是数据治理计划失败的主要原因",
      "集成使用分析（查询频率、最常访问的列、最后查询时间），使目录突出显示热门和最近验证过的数据集",
      "在摄取时为数据集标记敏感度分类，因为在数千个现有资产中追溯个人信息标签代价极高"
    ],
    "donts": [
      "Don't deploy a data catalog as a purely IT-driven project without data consumer buy-in because adoption requires analysts and data scientists to actively contribute business metadata",
      "Don't treat the catalog as a static inventory; without continuous metadata refresh the catalog drifts from reality and users stop trusting it",
      "Don't ignore social features like comments, ratings, and endorsements because quantitative usage signals alone cannot capture domain-specific trust",
      "Don't attempt to catalog every field in every dataset on day one; start with the highest-traffic tables and expand coverage iteratively"
    ],
    "donts_zh": [
      "不要将数据目录作为纯IT驱动的项目部署，而不获得数据消费者认可，因为采用需要分析师和数据科学家积极贡献业务元数据",
      "不要将目录视为静态清单；没有持续元数据刷新，目录会偏离现实，用户将停止信任它",
      "不要忽视评论、评分和推荐等社交功能，因为单纯的定量使用信号无法捕捉特定领域的信任",
      "不要在第一天就尝试对每个数据集的每个字段进行编目；从流量最高的表开始，迭代扩展覆盖范围"
    ],
    "case_study_company": "LinkedIn",
    "case_study": "LinkedIn built DataHub, an open-source metadata platform that catalogs over 500,000 datasets, 300,000 pipelines, and 100,000 ML features across its data ecosystem. DataHub uses a push-based architecture where each data system (Kafka, Hadoop, Presto, Pinot) emits metadata change events to a central metadata graph. This graph-based approach enables real-time impact analysis: engineers can instantly see all downstream dashboards and ML models affected when a schema change is proposed. LinkedIn reports that DataHub reduced mean time to discover a trusted dataset from days to under 10 minutes.",
    "case_study_zh": "LinkedIn构建了DataHub，一个开源元数据平台，对其数据生态系统中超过50万个数据集、30万个管道和10万个ML特征进行编目。DataHub使用基于推送的架构，每个数据系统（Kafka、Hadoop、Presto、Pinot）将元数据变更事件发送到中央元数据图。这种基于图的方法支持实时影响分析：当提出模式变更时，工程师可以立即看到所有受影响的下游仪表板和ML模型。LinkedIn报告DataHub将发现可信数据集的平均时间从几天缩短到10分钟以内。",
    "when_not_to_use": [
      "Small teams with fewer than 10 data sources where informal documentation (a shared wiki or README files) provides sufficient discoverability without catalog overhead",
      "When data assets are entirely managed by a single team with no cross-team sharing; the catalog's collaboration features provide no value in a single-owner environment",
      "Highly regulated environments where metadata itself (column names, schema structure) is classified; a public-facing catalog may expose sensitive system architecture",
      "When the organization lacks the data stewardship culture to curate and maintain metadata; a poorly maintained catalog is worse than no catalog"
    ],
    "when_not_to_use_zh": [
      "数据源少于10个的小团队，非正式文档（共享wiki或README文件）在不增加目录开销的情况下提供了足够的可发现性",
      "当数据资产完全由单个团队管理，没有跨团队共享时；在单一所有者环境中，目录的协作功能没有价值",
      "元数据本身（列名、模式结构）被分类的高度监管环境；面向公众的目录可能暴露敏感系统架构",
      "当组织缺乏数据管理文化来管理和维护元数据时；维护不善的目录比没有目录更糟糕"
    ],
    "adopters": [
      "LinkedIn",
      "Airbnb",
      "Shopify",
      "ING Bank",
      "Lyft",
      "Netflix"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "observability",
      "security"
    ],
    "maturity_ring": "established",
    "primary_source": "Alation (2012). 'The Collaborative Data Catalog'. alation.com. Shi, M. et al. (2016). 'Goods: Organizing Google's Datasets'. SIGMOD 2016.",
    "secondary_sources": [
      "Gartner (2019). 'Magic Quadrant for Metadata Management Solutions'. Gartner Research.",
      "Shankar, S. et al. (2022). 'We're All in This Together: The Open Metadata Standard'. openmetadata.org.",
      "Chafi, H. et al. (2021). 'DataHub: A Generalized Metadata Search & Discovery Tool'. LinkedIn Engineering Blog."
    ],
    "typed_relations": [
      {
        "slug": "data-mesh",
        "type": "complement"
      },
      {
        "slug": "data-lineage",
        "type": "complement"
      },
      {
        "slug": "data-quality-framework",
        "type": "complement"
      },
      {
        "slug": "data-contract",
        "type": "prerequisite"
      }
    ]
  },
  {
    "id": 307,
    "name": "Schema Registry",
    "name_zh": "模式注册表",
    "slug": "schema-registry",
    "category": "data",
    "desc": "Centralized service for storing, versioning, and enforcing data schemas in streaming and event-driven architectures, ensuring producer-consumer compatibility.",
    "desc_zh": "集中式服务，用于在流式和事件驱动架构中存储、版本化和强制执行数据模式，确保生产者与消费者的兼容性。",
    "steps": [
      "Register canonical schemas (Avro, Protobuf, or JSON Schema) for each Kafka topic or event stream using a compatibility-checked API",
      "Configure compatibility rules (BACKWARD, FORWARD, FULL, or NONE) per subject so the registry auto-validates new schema versions at registration time",
      "Integrate schema serializers/deserializers (SerDes) in producers and consumers so they look up schemas by ID at runtime instead of embedding schema in every message",
      "Version and tag schemas alongside service releases so schema evolution is traceable and rollbacks are possible",
      "Monitor schema usage dashboards to identify stale schemas, detect consumers blocked by incompatible changes, and track adoption of new schema versions"
    ],
    "steps_zh": [
      "使用兼容性检查API为每个Kafka主题或事件流注册规范模式（Avro、Protobuf或JSON Schema）",
      "为每个主题配置兼容性规则（向后、向前、全量或无），使注册表在注册时自动验证新模式版本",
      "在生产者和消费者中集成模式序列化/反序列化器（SerDes），使其在运行时按ID查找模式，而非在每条消息中嵌入模式",
      "将模式与服务发布一起版本化和标记，使模式演进可追溯且回滚成为可能",
      "监控模式使用仪表板，识别过时模式、检测被不兼容变更阻塞的消费者，并跟踪新模式版本的采用情况"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Schema Register",
      "Compatibility",
      "SerDes",
      "Versioning",
      "Usage Monitor"
    ],
    "viz_labels_zh": [
      "模式注册",
      "兼容性",
      "序列化器",
      "版本管理",
      "使用监控"
    ],
    "related": [
      "schema-registry-pattern",
      "data-catalog",
      "data-lineage",
      "data-contract",
      "stream-processing-patterns"
    ],
    "tags": [
      "schema",
      "kafka",
      "avro",
      "compatibility",
      "confluent",
      "streaming"
    ],
    "origin_author": "Confluent",
    "origin_source": "Confluent (2015). 'Schema Registry Documentation'. docs.confluent.io; Narkhede, N. et al. (2017). Kafka: The Definitive Guide. O'Reilly.",
    "origin_source_zh": "Confluent（2015）。「模式注册表文档」。docs.confluent.io；Narkhede, N. 等（2017）。《Kafka权威指南》。O'Reilly。",
    "complexity": "intermediate",
    "when_to_use": [
      "When building Kafka-based event streaming pipelines where multiple producer and consumer teams evolve independently and schema changes must not break downstream consumers",
      "When regulatory or audit requirements mandate that the exact schema of every event published to a data stream is recorded, versioned, and retrievable",
      "When using Avro, Protobuf, or JSON Schema serialization formats that externalize schema from payload, making a central registry essential for deserialization",
      "When operating a data platform with dozens of topics and teams where ad-hoc schema changes without compatibility checks are a leading cause of production incidents"
    ],
    "when_to_use_zh": [
      "当构建基于Kafka的事件流管道，多个生产者和消费者团队独立演进，且模式变更不得破坏下游消费者时",
      "当监管或审计规定要求记录、版本化和检索发布到数据流的每个事件的确切模式时",
      "当使用Avro、Protobuf或JSON Schema序列化格式时，这些格式将模式与有效负载分离，使中央注册表成为反序列化的必要条件",
      "当在拥有数十个主题和团队的数据平台上运营，且未经兼容性检查的临时模式变更是生产事故的主要原因时"
    ],
    "core_concepts": [
      "Schema subjects: named containers (typically one per Kafka topic) that hold all versions of a schema and enforce a configured compatibility policy across those versions",
      "Compatibility modes: BACKWARD (new schema reads old data), FORWARD (old schema reads new data), FULL (both directions), allowing teams to choose the right trade-off between producer and consumer flexibility",
      "Schema ID wire format: producers prepend only a magic byte and a 4-byte schema ID to each serialized message payload rather than embedding the full schema, reducing payload size by 80–95% while enabling consumers to fetch the exact schema for deserialization",
      "SerDes integration: Confluent's Kafka serializers and deserializers query the registry at startup and cache schemas locally, making schema resolution transparent to application code"
    ],
    "core_concepts_zh": [
      "模式主题：命名容器（通常每个Kafka主题一个），保存模式的所有版本并在这些版本上强制执行配置的兼容性策略",
      "兼容性模式：向后（新模式读取旧数据）、向前（旧模式读取新数据）、全量（双向），允许团队在生产者和消费者灵活性之间选择合适的权衡",
      "模式ID线格式：生产者仅在每条消息有效负载的开头写入一个魔术字节和4字节的模式ID，而非嵌入完整模式，将有效负载大小减少80–95%，同时使消费者能够获取反序列化的确切模式",
      "SerDes集成：Confluent的Kafka序列化和反序列化器在启动时查询注册表并在本地缓存模式，使模式解析对应用代码透明"
    ],
    "timeline": [
      [
        "2015",
        "Confluent releases the Schema Registry as part of the Confluent Platform, formalizing centralized schema management for Apache Kafka ecosystems"
      ],
      [
        "2017",
        "Hortonworks Schema Registry (now Cloudera Schema Registry) adds support for Kafka Streams and Flink, extending the pattern beyond Kafka to the broader streaming ecosystem"
      ],
      [
        "2020",
        "AWS Glue Schema Registry and Azure Event Hubs Schema Registry launch, making schema governance a first-class cloud platform feature"
      ],
      [
        "2023",
        "OpenAPI-based AsyncAPI specification gains traction as a schema-registry-adjacent standard for documenting event-driven APIs across brokers"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Confluent作为Confluent平台的一部分发布模式注册表，为Apache Kafka生态系统正式化集中式模式管理"
      ],
      [
        "2017",
        "Hortonworks模式注册表（现为Cloudera模式注册表）增加对Kafka Streams和Flink的支持，将该模式从Kafka扩展到更广泛的流式生态系统"
      ],
      [
        "2020",
        "AWS Glue模式注册表和Azure事件中心模式注册表推出，使模式治理成为一流云平台功能"
      ],
      [
        "2023",
        "基于OpenAPI的AsyncAPI规范作为跨消息代理文档化事件驱动API的模式注册表相邻标准获得广泛认可"
      ]
    ],
    "dos": [
      "Do set FULL compatibility on high-traffic topics shared across many consumer teams because it is the safest policy and prevents both producer and consumer from accidentally breaking each other",
      "Do version schemas alongside your service's release tags so you can correlate production incidents to specific schema changes in the git history",
      "Do run schema compatibility checks in CI before merging any producer code changes so incompatible schemas never reach the deployment pipeline",
      "Do monitor consumer lag per schema version to detect consumers pinned to old schema versions that will be affected by upcoming deprecations"
    ],
    "dos_zh": [
      "对在许多消费者团队之间共享的高流量主题设置全量兼容性，因为这是最安全的策略，防止生产者和消费者意外破坏对方",
      "将模式与服务的发布标签一起版本化，以便将生产事故与git历史中的特定模式变更关联",
      "在合并任何生产者代码变更之前在CI中运行模式兼容性检查，确保不兼容的模式不会进入部署管道",
      "监控每个模式版本的消费者延迟，以检测固定在旧模式版本的消费者，这些消费者将受到即将弃用的影响"
    ],
    "donts": [
      "Don't use NONE compatibility mode on shared topics because it removes all guardrails and allows any schema change regardless of consumer impact",
      "Don't hardcode schema IDs in consumer code because IDs are registry-specific and will differ across environments; always resolve IDs dynamically",
      "Don't delete schema versions that active consumers depend on; mark them as deprecated and give consumers a migration window before deletion",
      "Don't store large binary blobs or deeply nested schemas in the registry; extreme schema complexity should be a signal to decompose the event into smaller, focused schemas"
    ],
    "donts_zh": [
      "不要在共享主题上使用无兼容性模式，因为它去除所有防护措施，允许任何模式变更而不考虑对消费者的影响",
      "不要在消费者代码中硬编码模式ID，因为ID是特定于注册表的，在不同环境中会有所不同；始终动态解析ID",
      "不要删除活跃消费者依赖的模式版本；将其标记为已弃用，并在删除前给消费者留出迁移窗口",
      "不要在注册表中存储大型二进制blob或深度嵌套的模式；极端的模式复杂性应该是将事件分解为更小、更集中模式的信号"
    ],
    "case_study_company": "Zalando",
    "case_study": "Zalando, Europe's largest online fashion platform, enforces schema governance across 2,000+ Kafka topics using Confluent Schema Registry with FULL compatibility mode. Every event schema is authored in Avro and peer-reviewed via pull request before registration. The schema registry is integrated into Zalando's internal developer platform Nakadi, which exposes event types as REST APIs. When a producer team proposes an incompatible schema change, Nakadi's CI pipeline automatically identifies all consumer teams and opens compatibility-blocking pull requests to coordinate migration. This process has reduced schema-induced production incidents by 90% since 2018.",
    "case_study_zh": "欧洲最大的在线时尚平台Zalando使用Confluent模式注册表和全量兼容性模式对2000多个Kafka主题强制执行模式治理。每个事件模式在注册前以Avro编写并通过拉取请求进行同行评审。模式注册表集成到Zalando的内部开发者平台Nakadi中，该平台将事件类型作为REST API公开。当生产者团队提出不兼容的模式变更时，Nakadi的CI管道自动识别所有消费者团队并开启兼容性阻塞的拉取请求以协调迁移。自2018年以来，这一流程将模式引发的生产事故减少了90%。",
    "when_not_to_use": [
      "When using JSON or Avro with schemas embedded in each message payload and the team explicitly prefers schema-on-read flexibility over compatibility enforcement",
      "Small event-driven systems with fewer than 5 topics and a single team where the overhead of a separate registry service exceeds the benefit",
      "When all producers and consumers are tightly coupled in a monolith and deploy together; schema versioning adds process overhead without the decoupling benefit",
      "Prototyping phases where schema stability is not yet required and frequent structural changes are expected; introduce the registry when the schema stabilizes"
    ],
    "when_not_to_use_zh": [
      "当使用在每条消息有效负载中嵌入模式的JSON或Avro，且团队明确偏好读取时模式灵活性而非兼容性强制时",
      "拥有少于5个主题和单个团队的小型事件驱动系统，单独注册表服务的开销超过了收益",
      "当所有生产者和消费者在单体中紧密耦合并一起部署时；模式版本化增加了流程开销而没有解耦收益",
      "模式稳定性尚不需要且预期会有频繁结构变更的原型阶段；在模式稳定后引入注册表"
    ],
    "adopters": [
      "Confluent",
      "Zalando",
      "LinkedIn",
      "Uber",
      "Robinhood",
      "Goldman Sachs"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "maintainability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Confluent (2015). 'Schema Registry Documentation'. docs.confluent.io/platform/current/schema-registry/index.html.",
    "secondary_sources": [
      "Narkhede, N., Shapira, G. & Palino, T. (2017). Kafka: The Definitive Guide. O'Reilly Media. Chapter 7: Building Data Pipelines.",
      "Kreps, J. (2013). 'The Log: What every software engineer should know about real-time data's unifying abstraction'. engineering.linkedin.com.",
      "AWS (2020). 'Getting Started with AWS Glue Schema Registry'. AWS Documentation."
    ],
    "typed_relations": [
      {
        "slug": "schema-registry-pattern",
        "type": "related"
      },
      {
        "slug": "data-contract",
        "type": "complement"
      },
      {
        "slug": "stream-processing-patterns",
        "type": "complement"
      },
      {
        "slug": "data-catalog",
        "type": "complement"
      }
    ]
  },
  {
    "id": 308,
    "name": "Data Lineage",
    "name_zh": "数据血缘",
    "slug": "data-lineage-governance",
    "category": "data",
    "desc": "Systematic tracking and visualization of data flow from origin through all transformations to final consumption, enabling impact analysis and regulatory compliance.",
    "desc_zh": "系统地追踪和可视化数据从起源经过所有转换到最终消费的流动，支持影响分析和法规合规。",
    "steps": [
      "Instrument data pipelines, ETL jobs, and query engines to emit lineage events (source, transformation, destination) at execution time using open standards like OpenLineage",
      "Collect and store lineage events in a graph database where nodes represent datasets or jobs and edges represent data flow or transformation relationships",
      "Build column-level lineage by parsing SQL and transformation code to trace individual field derivations through multi-hop pipelines",
      "Expose lineage graphs through a visualization layer that shows upstream dependencies and downstream impact for any selected dataset or column",
      "Integrate lineage with data catalog entries, data quality alerts, and incident management so root-cause analysis automatically surfaces the affected lineage path"
    ],
    "steps_zh": [
      "使用OpenLineage等开放标准对数据管道、ETL作业和查询引擎进行插桩，在执行时发出血缘事件（源、转换、目标）",
      "在图数据库中收集和存储血缘事件，其中节点代表数据集或作业，边代表数据流或转换关系",
      "通过解析SQL和转换代码构建列级血缘，以跟踪多跳管道中的各个字段派生",
      "通过可视化层展示血缘图，显示任何选定数据集或列的上游依赖和下游影响",
      "将血缘与数据目录条目、数据质量告警和事件管理集成，使根本原因分析自动显示受影响的血缘路径"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Lineage Events",
      "Graph Store",
      "Column Lineage",
      "Visualization",
      "Impact Alert"
    ],
    "viz_labels_zh": [
      "血缘事件",
      "图存储",
      "列级血缘",
      "可视化",
      "影响告警"
    ],
    "related": [
      "data-catalog",
      "data-quality-framework",
      "medallion-architecture",
      "data-contract"
    ],
    "tags": [
      "lineage",
      "apache-atlas",
      "openlineage",
      "governance",
      "impact-analysis",
      "data-quality"
    ],
    "origin_author": "Apache Atlas",
    "origin_source": "Apache Atlas Project (2015). 'Atlas: Data Governance and Metadata Framework for Hadoop'. atlas.apache.org; OpenLineage Specification (Astronomer, 2021).",
    "origin_source_zh": "Apache Atlas项目（2015）。「Atlas：Hadoop的数据治理和元数据框架」。atlas.apache.org；OpenLineage规范（Astronomer，2021）。",
    "complexity": "advanced",
    "when_to_use": [
      "When GDPR, CCPA, or BCBS 239 compliance requires demonstrating exactly which source systems contributed to a regulatory report and how the data was transformed",
      "When a data quality incident occurs and the team needs to rapidly identify all downstream dashboards, ML models, and reports consuming data from the corrupted source",
      "When planning schema changes or decommissioning a dataset and you need to assess the full blast radius before executing the change",
      "When building a data mesh where domain teams need visibility into cross-domain data dependencies to avoid uncoordinated breaking changes"
    ],
    "when_to_use_zh": [
      "当GDPR、CCPA或BCBS 239合规要求证明哪些源系统为监管报告提供了数据以及数据如何被转换时",
      "当发生数据质量事件，团队需要快速识别所有下游仪表板、ML模型和从受损源消费数据的报告时",
      "当计划模式变更或下线数据集时，需要在执行变更前评估完整的影响范围",
      "当构建数据网格时，领域团队需要了解跨领域数据依赖关系以避免不协调的破坏性变更"
    ],
    "core_concepts": [
      "Dataset-level lineage: a directed acyclic graph (DAG) where nodes are datasets or tables and edges represent ETL jobs or SQL queries that produced the destination from its sources",
      "Column-level lineage: fine-grained tracking of individual column derivations through transformations, enabling precise impact analysis when a single upstream column changes",
      "OpenLineage: an open standard (Astronomer, 2021, now Linux Foundation) that defines a vendor-neutral event spec for emitting lineage from Airflow, Spark, dbt, Flink, and other engines",
      "Impact analysis vs. root-cause analysis: downstream traversal (what breaks if I change X) vs. upstream traversal (where did this bad data come from) — the two primary lineage use cases"
    ],
    "core_concepts_zh": [
      "数据集级血缘：有向无环图（DAG），其中节点是数据集或表，边代表从源生成目标的ETL作业或SQL查询",
      "列级血缘：通过转换对单个列派生的细粒度追踪，支持当单个上游列变更时进行精确影响分析",
      "OpenLineage：由Astronomer于2021年创建（现为Linux基金会）的开放标准，定义了用于从Airflow、Spark、dbt、Flink和其他引擎发出血缘的厂商中立事件规范",
      "影响分析与根本原因分析：下游遍历（如果我更改X会发生什么）与上游遍历（这些坏数据来自哪里）——两个主要的血缘用例"
    ],
    "timeline": [
      [
        "2015",
        "Apache Atlas released as part of the Hadoop ecosystem, providing the first open-source metadata and lineage framework for Hive, HBase, and HDFS"
      ],
      [
        "2018",
        "Google Cloud Data Catalog and Collibra launch automated lineage, moving data lineage from a manual documentation practice to an automated platform feature"
      ],
      [
        "2021",
        "OpenLineage specification published by Astronomer, establishing a vendor-neutral standard for lineage event emission across Airflow, Spark, and dbt"
      ],
      [
        "2023",
        "Column-level lineage becomes table stakes in enterprise data catalogs; dbt adds native column lineage support, extending graph-level lineage to granular field derivations"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Apache Atlas作为Hadoop生态系统的一部分发布，为Hive、HBase和HDFS提供第一个开源元数据和血缘框架"
      ],
      [
        "2018",
        "Google Cloud数据目录和Collibra推出自动化血缘，将数据血缘从手动文档实践转变为自动化平台功能"
      ],
      [
        "2021",
        "Astronomer发布OpenLineage规范，为Airflow、Spark和dbt的血缘事件发送建立厂商中立标准"
      ],
      [
        "2023",
        "列级血缘成为企业数据目录的标配；dbt增加原生列血缘支持，将图级血缘扩展到细粒度字段派生"
      ]
    ],
    "dos": [
      "Do adopt OpenLineage as your lineage emission standard from the start because vendor-specific lineage APIs create lock-in and prevent cross-tool lineage stitching",
      "Do capture column-level lineage from SQL engines and dbt models because dataset-level lineage alone cannot support regulatory column-to-report tracing requirements",
      "Do store lineage in a graph database (Neo4j, Amazon Neptune) rather than relational tables because multi-hop traversal queries are prohibitively expensive in SQL at scale",
      "Do integrate lineage alerts into your data quality monitoring so that when an upstream dataset fails validation, all downstream consumers receive automatic impact notifications"
    ],
    "dos_zh": [
      "从一开始就采用OpenLineage作为血缘发送标准，因为特定于供应商的血缘API会造成锁定并阻止跨工具血缘拼接",
      "从SQL引擎和dbt模型捕获列级血缘，因为仅靠数据集级血缘无法支持监管列到报告的追踪要求",
      "将血缘存储在图数据库（Neo4j、Amazon Neptune）而非关系表中，因为多跳遍历查询在大规模SQL中代价极高",
      "将血缘告警集成到数据质量监控中，当上游数据集验证失败时，所有下游消费者会自动收到影响通知"
    ],
    "donts": [
      "Don't rely solely on static code parsing for lineage because dynamic SQL, stored procedures, and runtime-generated queries are invisible to static analysis",
      "Don't attempt to capture lineage retroactively for existing pipelines by reading logs; instrument pipelines going forward and accept a lineage coverage gap for legacy systems",
      "Don't expose raw lineage graphs to business users without abstraction because hundreds of nodes and edges overwhelm non-technical stakeholders; build purpose-specific views",
      "Don't conflate lineage with data catalog; lineage is a dynamic, runtime-derived graph while a catalog is a curated, human-enriched metadata store — they complement each other"
    ],
    "donts_zh": [
      "不要仅依赖静态代码解析进行血缘，因为动态SQL、存储过程和运行时生成的查询对静态分析不可见",
      "不要尝试通过读取日志为现有管道追溯捕获血缘；对管道进行前向插桩，并接受遗留系统的血缘覆盖空缺",
      "不要将原始血缘图直接暴露给业务用户而不加抽象，因为数百个节点和边会压倒非技术利益相关者；构建特定目的的视图",
      "不要将血缘与数据目录混淆；血缘是动态的运行时派生图，而目录是精心策划的人工丰富元数据存储——它们相互补充"
    ],
    "case_study_company": "ING Bank",
    "case_study": "ING Bank implemented end-to-end data lineage across 15,000 data assets to satisfy the Basel Committee on Banking Supervision's BCBS 239 risk data aggregation and risk reporting requirements. Using Apache Atlas as the metadata backbone, ING mapped column-level lineage from operational databases through 300+ ETL pipelines to 120 regulatory reports. When supervisors requested proof of data origin for capital adequacy ratios, ING's lineage system produced a complete audit trail from source transaction records to the final reported figure in under 4 hours — a process that previously required 3 weeks of manual investigation. The lineage system now detects 85% of pipeline failures through downstream impact propagation before business users notice.",
    "case_study_zh": "ING银行在15000个数据资产中实施端到端数据血缘，以满足巴塞尔银行监管委员会BCBS 239风险数据汇总与风险报告要求。使用Apache Atlas作为元数据骨干，ING通过300多个ETL管道将列级血缘从运营数据库映射到120个监管报告。当监管机构要求提供资本充足率的数据来源证明时，ING的血缘系统在4小时内从源交易记录到最终报告数字生成了完整的审计追踪——这一过程以前需要3周的人工调查。血缘系统现在通过下游影响传播在业务用户注意到之前检测到85%的管道故障。",
    "when_not_to_use": [
      "Small teams with fewer than 20 pipelines where a simple data flow diagram maintained in Confluence provides sufficient lineage visibility without the engineering cost",
      "Real-time OLTP systems where data lineage tracking adds latency to transaction processing; use async audit logs instead",
      "When all data transformations happen inside a single SQL warehouse or dbt project; native lineage tools in those platforms are sufficient without a separate lineage service",
      "Early-stage startups where data infrastructure is volatile and lineage graphs would be outdated within weeks; invest in lineage when the data architecture stabilizes"
    ],
    "when_not_to_use_zh": [
      "管道少于20个的小团队，在Confluence中维护的简单数据流图在没有工程成本的情况下提供了足够的血缘可见性",
      "数据血缘追踪会给事务处理增加延迟的实时OLTP系统；改用异步审计日志",
      "当所有数据转换都在单个SQL数据仓库或dbt项目内发生时；这些平台中的原生血缘工具已足够，无需单独的血缘服务",
      "数据基础设施不稳定且血缘图会在数周内过时的早期创业公司；在数据架构稳定后再投资于血缘"
    ],
    "adopters": [
      "ING Bank",
      "Airbnb",
      "WeWork",
      "Spotify",
      "Danske Bank",
      "Astronomer"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "maintainability",
      "security"
    ],
    "maturity_ring": "established",
    "primary_source": "Apache Software Foundation (2015). 'Apache Atlas: Data Governance and Metadata Framework'. atlas.apache.org.",
    "secondary_sources": [
      "Turki, M. et al. (2021). 'OpenLineage: An Open Standard for Metadata and Lineage Collection'. Astronomer Blog / Linux Foundation.",
      "Collibra (2022). 'The State of Data Intelligence Report 2022'. collibra.com.",
      "Reis, J. & Housley, M. (2022). Fundamentals of Data Engineering. O'Reilly Media. Chapter 9: Serving Data for Analytics, Machine Learning, and Reverse ETL."
    ],
    "typed_relations": [
      {
        "slug": "data-catalog",
        "type": "complement"
      },
      {
        "slug": "data-quality-framework",
        "type": "complement"
      },
      {
        "slug": "medallion-architecture",
        "type": "complement"
      },
      {
        "slug": "data-contract",
        "type": "complement"
      }
    ]
  },
  {
    "id": 309,
    "name": "Feature Store",
    "name_zh": "特征仓库",
    "slug": "feature-store",
    "category": "data",
    "desc": "Centralized repository that manages the full lifecycle of ML features — from computation and storage to serving — enabling feature reuse, consistency between training and inference, and governance.",
    "desc_zh": "集中式存储库，管理ML特征的完整生命周期——从计算和存储到服务——支持特征复用、训练与推理之间的一致性以及治理。",
    "steps": [
      "Define feature transformations as versioned, reusable feature pipelines (using Spark, dbt, or Python) that compute features from raw data sources on a scheduled or streaming basis",
      "Store computed features in a dual-store architecture: an offline store (data warehouse or object store) for historical training data, and an online store (Redis, DynamoDB) for low-latency inference serving",
      "Register features in a feature registry with metadata (owner, data type, freshness SLA, entity key, computation logic) so data scientists can discover and reuse features without recomputing",
      "Enforce point-in-time correct feature retrieval for training datasets so features are fetched as they would have been known at the label timestamp, preventing training-serving skew",
      "Monitor feature freshness, drift, and usage metrics to detect stale features, model degradation signals, and unused features consuming compute budget"
    ],
    "steps_zh": [
      "将特征转换定义为版本化、可复用的特征管道（使用Spark、dbt或Python），按计划或流式方式从原始数据源计算特征",
      "将计算出的特征存储在双存储架构中：离线存储（数据仓库或对象存储）用于历史训练数据，在线存储（Redis、DynamoDB）用于低延迟推理服务",
      "在特征注册表中注册特征，包含元数据（所有者、数据类型、新鲜度SLA、实体键、计算逻辑），使数据科学家无需重新计算即可发现和复用特征",
      "对训练数据集强制执行时间点正确的特征检索，确保特征按标签时间戳时已知的状态获取，防止训练与服务偏差",
      "监控特征新鲜度、漂移和使用指标，以检测过时特征、模型退化信号和消耗计算预算的未使用特征"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Feature Pipeline",
      "Dual Store",
      "Feature Registry",
      "Point-in-Time",
      "Drift Monitor"
    ],
    "viz_labels_zh": [
      "特征管道",
      "双存储",
      "特征注册",
      "时间点回溯",
      "漂移监控"
    ],
    "related": [
      "feature-store-pattern",
      "data-catalog",
      "data-lineage-governance",
      "data-lakehouse",
      "data-mesh"
    ],
    "tags": [
      "feature-store",
      "machine-learning",
      "mlops",
      "uber",
      "michelangelo",
      "training-serving-skew"
    ],
    "origin_author": "Uber (Michelangelo)",
    "origin_source": "Hermann, J. & Del Balso, M. (2017). 'Meet Michelangelo: Uber's Machine Learning Platform'. Uber Engineering Blog. eng.uber.com.",
    "origin_source_zh": "Hermann, J. & Del Balso, M.（2017）。「认识Michelangelo：Uber的机器学习平台」。Uber工程博客。eng.uber.com。",
    "complexity": "advanced",
    "when_to_use": [
      "When multiple ML teams are independently computing the same features (e.g., user age, trip count, churn score) from the same raw data sources, creating duplicate computation and inconsistency",
      "When training-serving skew is a recurring source of model performance degradation because features are computed differently in the batch training pipeline and the real-time inference path",
      "When regulatory compliance (GDPR right-to-erasure, CCPA) requires point-in-time auditable records of which feature values were used to make a specific ML-driven decision",
      "When the organization operates more than 10 production ML models and the total cost of per-model feature engineering is significant enough to justify a shared platform"
    ],
    "when_to_use_zh": [
      "当多个ML团队从相同原始数据源独立计算相同特征（如用户年龄、行程数、流失评分），产生重复计算和不一致性时",
      "当训练与服务偏差是模型性能退化的反复来源，因为特征在批量训练管道和实时推理路径中计算方式不同时",
      "当监管合规（GDPR删除权、CCPA）要求对特定ML驱动决策使用的特征值进行时间点可审计记录时",
      "当组织运营超过10个生产ML模型，且每个模型特征工程的总成本足以证明共享平台的合理性时"
    ],
    "core_concepts": [
      "Offline store: the historical feature store backed by a column-oriented data warehouse (BigQuery, Snowflake) or object store (S3 + Parquet) used to generate training datasets at any historical point in time",
      "Online store: a low-latency key-value store (Redis, DynamoDB, Bigtable) that holds the latest feature values for each entity (user ID, item ID) and serves features to real-time inference engines at p99 < 10ms",
      "Training-serving skew: the critical problem that arises when feature computation logic differs between training time and inference time, causing the model to see different feature distributions in production than it was trained on",
      "Point-in-time correct retrieval: the mechanism of joining feature values to training labels using only feature values that existed at or before the label timestamp, preventing data leakage and future information contamination"
    ],
    "core_concepts_zh": [
      "离线存储：由列式数据仓库（BigQuery、Snowflake）或对象存储（S3 + Parquet）支持的历史特征存储，用于在任意历史时间点生成训练数据集",
      "在线存储：低延迟键值存储（Redis、DynamoDB、Bigtable），保存每个实体（用户ID、商品ID）的最新特征值，以p99 < 10ms的速度为实时推理引擎提供特征",
      "训练与服务偏差：当特征计算逻辑在训练时和推理时不同时出现的关键问题，导致模型在生产环境中看到与训练时不同的特征分布",
      "时间点正确检索：使用标签时间戳时或之前存在的特征值将特征值与训练标签连接的机制，防止数据泄漏和未来信息污染"
    ],
    "timeline": [
      [
        "2017",
        "Uber publishes Michelangelo, the first widely-documented feature store, demonstrating how a centralized platform eliminates duplicate feature engineering across 200+ ML models"
      ],
      [
        "2019",
        "Feast (Feature Store for ML) released as open source by GoJek and Google, providing the first vendor-neutral feature store implementation"
      ],
      [
        "2021",
        "Tecton, Hopsworks, and SageMaker Feature Store launch as managed feature store services; feature stores become a recognized MLOps platform category"
      ],
      [
        "2023",
        "Databricks Feature Store integrated into Unity Catalog; feature stores converge with data catalogs as organizations seek unified governance across analytics and ML assets"
      ]
    ],
    "timeline_zh": [
      [
        "2017",
        "Uber发布Michelangelo，第一个有广泛文档记录的特征仓库，展示集中式平台如何消除200多个ML模型的重复特征工程"
      ],
      [
        "2019",
        "GoJek和Google以开源方式发布Feast（ML特征仓库），提供第一个厂商中立的特征仓库实现"
      ],
      [
        "2021",
        "Tecton、Hopsworks和SageMaker特征仓库作为托管特征仓库服务推出；特征仓库成为公认的MLOps平台类别"
      ],
      [
        "2023",
        "Databricks特征仓库集成到Unity Catalog；随着组织寻求跨分析和ML资产的统一治理，特征仓库与数据目录逐渐融合"
      ]
    ],
    "dos": [
      "Do separate the offline and online store architectures from day one because conflating historical training storage with real-time serving storage leads to unacceptable latency trade-offs",
      "Do enforce point-in-time correct feature retrieval in all training dataset generation pipelines because even a single data leakage incident can invalidate months of model evaluation metrics",
      "Do include feature freshness SLAs in the registry and alert when features become stale because a stale feature at inference time is equivalent to missing data from the model's perspective",
      "Do version feature definitions alongside model versions in the model registry so that model rollbacks automatically revert to the correct feature computation logic"
    ],
    "dos_zh": [
      "从第一天起就分离离线和在线存储架构，因为将历史训练存储与实时服务存储混淆会导致不可接受的延迟权衡",
      "在所有训练数据集生成管道中强制执行时间点正确的特征检索，因为即使一次数据泄漏事件也可能使数月的模型评估指标失效",
      "在注册表中包含特征新鲜度SLA并在特征过时时告警，因为推理时的过时特征相当于模型视角的缺失数据",
      "将特征定义与模型注册表中的模型版本一起版本化，使模型回滚自动恢复到正确的特征计算逻辑"
    ],
    "donts": [
      "Don't maintain separate feature pipelines for training and real-time inference without explicit validation that their logic is identical; a single shared feature definition is safer than duplicated code that can silently diverge",
      "Don't store raw events in the feature store; the feature store should contain pre-aggregated features, not raw data — raw event storage belongs in the data lake",
      "Don't allow data scientists to create ephemeral, undocumented feature transformations outside the feature store; all production features must be registered, tested, and governed",
      "Don't deploy a feature store before you have at least two models that could share features; the platform overhead is not justified for a single model"
    ],
    "donts_zh": [
      "不要在没有明确验证逻辑相同的情况下为训练和实时推理分别维护独立的特征管道；单一共享的特征定义比可能无声发散的重复代码更安全",
      "不要在特征仓库中存储原始事件；特征仓库应包含预聚合的特征，而非原始数据——原始事件存储属于数据湖",
      "不要允许数据科学家在特征仓库外创建临时的、未记录的特征转换；所有生产特征必须注册、测试和治理",
      "在至少有两个可以共享特征的模型之前不要部署特征仓库；对单个模型而言平台开销不合理"
    ],
    "case_study_company": "Airbnb",
    "case_study": "Airbnb built Zipline, its internal feature store, to serve 150+ ML models used in search ranking, price suggestion, fraud detection, and guest trust scoring. Before Zipline, data scientists spent 60% of their time on feature engineering; after, they reported spending under 20% because most features for a new model already existed in the store. Zipline's point-in-time correct training dataset generation prevented a class of training-serving skew bugs that had caused Airbnb's search ranking model to perform 8% worse in production than in offline evaluation. The platform processes 1.5 million feature computation jobs per day and serves features at p99 latency of 4ms.",
    "case_study_zh": "Airbnb构建了Zipline，其内部特征仓库，为搜索排名、价格建议、欺诈检测和房客信任评分中使用的150多个ML模型提供服务。在Zipline之前，数据科学家花费60%的时间在特征工程上；之后，他们报告花费不到20%，因为新模型的大多数特征已经存在于仓库中。Zipline的时间点正确训练数据集生成防止了一类训练与服务偏差错误，这些错误曾导致Airbnb的搜索排名模型在生产环境中比离线评估差8%。该平台每天处理150万个特征计算作业，以4ms的p99延迟提供特征。",
    "when_not_to_use": [
      "When an organization has fewer than 5 ML models in production; the operational overhead of a feature store exceeds its benefits at this scale",
      "When all ML models are batch-inference only with no real-time serving requirements; a simple feature engineering pipeline into the data warehouse is sufficient",
      "When data science teams are small (fewer than 5 people) and informal collaboration is sufficient to avoid duplicate feature engineering without platform overhead",
      "When ML models use unstructured data (images, text, audio) as primary inputs; feature stores are optimized for tabular entity-based features, not embedding stores"
    ],
    "when_not_to_use_zh": [
      "当组织在生产中有少于5个ML模型时；特征仓库的运营开销在此规模超过其收益",
      "当所有ML模型仅为批量推理且没有实时服务要求时；将简单特征工程管道导入数据仓库已足够",
      "当数据科学团队规模较小（少于5人）且非正式协作足以避免重复特征工程而无平台开销时",
      "当ML模型以非结构化数据（图像、文本、音频）作为主要输入时；特征仓库针对基于表格实体的特征进行优化，而非嵌入存储"
    ],
    "adopters": [
      "Uber",
      "Airbnb",
      "LinkedIn",
      "Twitter",
      "Stripe",
      "DoorDash"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Hermann, J. & Del Balso, M. (2017). 'Meet Michelangelo: Uber's Machine Learning Platform'. Uber Engineering Blog. eng.uber.com/michelangelo-machine-learning-platform.",
    "secondary_sources": [
      "Pienaar, W. (2019). 'Feast: Bridging ML Models and Data'. GoJek Engineering Blog. medium.com/gojekengineering.",
      "Huyen, C. (2022). 'Designing Machine Learning Systems'. O'Reilly Media. Chapter 5: Feature Engineering.",
      "Tecton (2021). 'The Feature Store: A Guide to ML Feature Management'. tecton.ai/blog."
    ],
    "typed_relations": [
      {
        "slug": "feature-store-pattern",
        "type": "related"
      },
      {
        "slug": "data-lakehouse",
        "type": "complement"
      },
      {
        "slug": "data-catalog",
        "type": "complement"
      },
      {
        "slug": "data-lineage-governance",
        "type": "complement"
      }
    ]
  },
  {
    "id": 310,
    "name": "Data Contract",
    "name_zh": "数据契约",
    "slug": "data-contract",
    "category": "data",
    "desc": "Formal, versioned agreements between data producers and consumers that specify schema, quality expectations, SLAs, and ownership, treating data as a product with explicit interface guarantees.",
    "desc_zh": "数据生产者和消费者之间的正式、版本化协议，规定模式、质量期望、SLA和所有权，将数据视为具有明确接口保证的产品。",
    "steps": [
      "Define the contract schema using a machine-readable format (YAML or JSON) specifying dataset name, owner, version, column definitions, data types, nullability, and primary key constraints",
      "Encode quality clauses: freshness SLA (maximum allowed data age), completeness thresholds (minimum non-null percentage per column), validity rules (value ranges, regex patterns, referential integrity)",
      "Establish semantic versioning for the contract (MAJOR for breaking changes, MINOR for additive changes) and publish to a contract registry visible to all consuming teams",
      "Implement contract validation in the producer's pipeline CI/CD so that schema and quality rule violations block deployment before bad data reaches consumers",
      "Monitor contract compliance at runtime with automated data quality checks and publish a contract health dashboard so consumers can assess data reliability before building dependent systems"
    ],
    "steps_zh": [
      "使用机器可读格式（YAML或JSON）定义契约模式，指定数据集名称、所有者、版本、列定义、数据类型、可空性和主键约束",
      "编码质量条款：新鲜度SLA（最大允许数据年龄）、完整性阈值（每列最低非空百分比）、有效性规则（值范围、正则表达式模式、引用完整性）",
      "为契约建立语义版本控制（破坏性变更用MAJOR，增量变更用MINOR），并发布到对所有消费团队可见的契约注册表",
      "在生产者的管道CI/CD中实施契约验证，使模式和质量规则违规在坏数据到达消费者之前阻止部署",
      "通过自动化数据质量检查在运行时监控契约合规性，并发布契约健康仪表板，使消费者在构建依赖系统之前能够评估数据可靠性"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Schema Contract",
      "Quality Clauses",
      "Versioning",
      "CI Validation",
      "Runtime Monitor"
    ],
    "viz_labels_zh": [
      "模式契约",
      "质量条款",
      "版本控制",
      "CI验证",
      "运行时监控"
    ],
    "related": [
      "data-catalog",
      "data-quality-framework",
      "schema-registry",
      "data-mesh",
      "data-lineage-governance"
    ],
    "tags": [
      "data-contract",
      "data-quality",
      "producer-consumer",
      "data-mesh",
      "governance",
      "andrew-jones"
    ],
    "origin_author": "Andrew Jones",
    "origin_source": "Jones, A. (2022). 'Driving Data Quality with Data Contracts'. Substack / PayPal Engineering. Andrew Jones (2023). Driving Data Quality with Data Contracts. O'Reilly.",
    "origin_source_zh": "Jones, A.（2022）。「用数据契约驱动数据质量」。Substack / PayPal工程。Andrew Jones（2023）。《用数据契约驱动数据质量》。O'Reilly。",
    "complexity": "intermediate",
    "when_to_use": [
      "When data consumers (ML teams, analytics, downstream services) are repeatedly disrupted by silent schema changes or data quality regressions from upstream producer teams",
      "When implementing a data mesh architecture where domain teams publish data products and need explicit interface contracts to decouple producers from consumers",
      "When regulatory requirements (SOX, GDPR) demand documented data provenance and quality guarantees that can be audited independently of the producing team",
      "When the cost of data incidents (broken dashboards, bad model predictions, failed reports) is high enough to justify investing in prevention through formal producer accountability"
    ],
    "when_to_use_zh": [
      "当数据消费者（ML团队、分析、下游服务）反复因上游生产者团队的无声模式变更或数据质量回归而中断时",
      "当实施数据网格架构时，领域团队发布数据产品并需要明确的接口契约以将生产者与消费者解耦",
      "当监管要求（SOX、GDPR）要求有文档化的数据来源和质量保证，可以独立于生产团队进行审计时",
      "当数据事件（损坏的仪表板、错误的模型预测、失败的报告）的成本足够高，值得通过正式的生产者责任制进行预防性投资时"
    ],
    "core_concepts": [
      "Schema contract: a versioned, machine-readable specification of the dataset's structure (columns, types, nullability, primary keys) that acts as the API contract between a data producer and its consumers",
      "Quality contract: explicit, measurable data quality obligations including freshness SLA (e.g., data must be no older than 4 hours), completeness (e.g., user_id null rate < 0.1%), and validity rules",
      "Producer ownership model: the principle that the data producer team — not consumers — is responsible for authoring, versioning, and guaranteeing the contract, shifting accountability upstream",
      "Semantic versioning for data: applying MAJOR.MINOR.PATCH conventions to data schemas where MAJOR changes (column removal, type change) trigger a mandatory consumer migration workflow"
    ],
    "core_concepts_zh": [
      "模式契约：数据集结构（列、类型、可空性、主键）的版本化、机器可读规范，作为数据生产者与消费者之间的API契约",
      "质量契约：明确的、可衡量的数据质量义务，包括新鲜度SLA（如数据不得超过4小时）、完整性（如user_id空值率 < 0.1%）和有效性规则",
      "生产者所有权模型：数据生产者团队——而非消费者——负责创作、版本化和保证契约的原则，将责任向上游转移",
      "数据语义版本控制：将MAJOR.MINOR.PATCH约定应用于数据模式，其中MAJOR变更（删除列、类型变更）触发强制性消费者迁移工作流"
    ],
    "timeline": [
      [
        "2019",
        "Notion of 'data contracts' begins appearing in data engineering blogs as a term for informal producer-consumer agreements; not yet formalized"
      ],
      [
        "2022",
        "Andrew Jones (PayPal) publishes a seminal blog post 'Driving Data Quality with Data Contracts', popularizing the term and defining a practical framework"
      ],
      [
        "2023",
        "Open Data Contract Standard (ODCS) published by PayPal Engineering; Soda Core, Great Expectations, and dbt integrate data contract validation into their tooling"
      ],
      [
        "2024",
        "Data contracts become a cornerstone pattern of data mesh implementations; major cloud data platforms (Databricks, Snowflake) add native contract enforcement capabilities"
      ]
    ],
    "timeline_zh": [
      [
        "2019",
        "「数据契约」概念开始出现在数据工程博客中，作为非正式生产者-消费者协议的术语；尚未正式化"
      ],
      [
        "2022",
        "Andrew Jones（PayPal）发表具有里程碑意义的博客文章「用数据契约驱动数据质量」，普及该术语并定义了实用框架"
      ],
      [
        "2023",
        "PayPal工程发布开放数据契约标准（ODCS）；Soda Core、Great Expectations和dbt将数据契约验证集成到其工具中"
      ],
      [
        "2024",
        "数据契约成为数据网格实施的基石模式；主要云数据平台（Databricks、Snowflake）增加原生契约执行能力"
      ]
    ],
    "dos": [
      "Do author contracts in a machine-readable format (YAML/JSON) stored in git alongside the producer's pipeline code so that contract changes go through the same review process as code changes",
      "Do enforce contracts in the producer's CI/CD pipeline as a pre-deployment gate because post-deployment contract violations are already downstream incidents",
      "Do use semantic versioning and publish breaking change notices at least one sprint before the breaking schema change lands so consumers have time to adapt",
      "Do start with the highest-impact datasets (those feeding the most downstream consumers or regulatory reports) and expand contract coverage incrementally"
    ],
    "dos_zh": [
      "以机器可读格式（YAML/JSON）编写契约，与生产者的管道代码一起存储在git中，使契约变更经过与代码变更相同的审查流程",
      "在生产者的CI/CD管道中将契约作为部署前关卡强制执行，因为部署后的契约违规已经是下游事件",
      "使用语义版本控制，并在破坏性模式变更落地前至少提前一个迭代发布破坏性变更通知，给消费者留出适应时间",
      "从影响最大的数据集（那些为最多下游消费者或监管报告提供数据的数据集）开始，逐步扩展契约覆盖范围"
    ],
    "donts": [
      "Don't treat data contracts as documentation artifacts divorced from enforcement; an unverified contract is just a comment that will drift from reality over time",
      "Don't require consumers to author contracts on behalf of producers because producer-owned contracts align accountability; consumer-authored contracts create ownership ambiguity",
      "Don't apply uniform quality thresholds across all datasets; high-traffic production datasets warrant stricter SLAs than internal experimental tables",
      "Don't block all consumers from a dataset when a contract violation occurs; implement a tiered notification system that alerts consumers while the producer investigates"
    ],
    "donts_zh": [
      "不要将数据契约视为脱离执行的文档工件；未经验证的契约只是一个会随时间偏离现实的注释",
      "不要要求消费者代表生产者创作契约，因为生产者拥有的契约与责任一致；消费者创作的契约会产生所有权歧义",
      "不要对所有数据集应用统一的质量阈值；高流量生产数据集比内部实验表需要更严格的SLA",
      "不要在契约违规发生时阻止所有消费者访问数据集；实施分层通知系统，在生产者调查时告警消费者"
    ],
    "case_study_company": "PayPal",
    "case_study": "PayPal pioneered data contracts at scale after a 2021 data incident where an undocumented schema change to a payments dataset broke 47 downstream dashboards and delayed a regulatory filing. The engineering team, led by Andrew Jones, formalized data contracts as YAML specifications stored in the same Git repository as the producing pipeline. Each contract specifies schema, freshness SLA (payments data must be no older than 1 hour), nullability rates, and consumer registry. PayPal now enforces contracts as a CI/CD gate: any pipeline change that violates a contract is automatically blocked, and the affected consumer teams are notified. The program reduced data-incident-related engineering time by 70% within 18 months.",
    "case_study_zh": "PayPal在2021年一次数据事件后率先大规模推行数据契约，该事件中对支付数据集的未记录模式变更破坏了47个下游仪表板并延迟了监管申报。由Andrew Jones领导的工程团队将数据契约正式化为YAML规范，与生产管道存储在同一个Git仓库中。每个契约规定模式、新鲜度SLA（支付数据不得超过1小时）、可空性比率和消费者注册表。PayPal现在将契约作为CI/CD关卡执行：任何违反契约的管道变更都会自动被阻止，并通知受影响的消费者团队。该计划在18个月内将数据事件相关的工程时间减少了70%。",
    "when_not_to_use": [
      "Early-stage data exploration where schemas are intentionally fluid and imposing contract rigidity would slow down legitimate research and discovery work",
      "Internal experimental datasets consumed by a single team; the overhead of authoring and maintaining a formal contract is not justified without cross-team consumers",
      "When the organization lacks the tooling infrastructure to automate contract validation; manual contract checks are too unreliable to provide meaningful quality guarantees",
      "Real-time streaming pipelines requiring sub-second schema evolution; synchronous contract validation adds latency incompatible with high-throughput event streams"
    ],
    "when_not_to_use_zh": [
      "模式有意流动且强加契约刚性会减慢合理研究和发现工作的早期数据探索阶段",
      "由单个团队消费的内部实验数据集；在没有跨团队消费者的情况下，编写和维护正式契约的开销不合理",
      "当组织缺乏自动化契约验证的工具基础设施时；手动契约检查太不可靠，无法提供有意义的质量保证",
      "需要亚秒级模式演进的实时流管道；同步契约验证增加的延迟与高吞吐量事件流不兼容"
    ],
    "adopters": [
      "PayPal",
      "Spotify",
      "JPMC",
      "Airbnb",
      "Mercado Libre",
      "ING Bank"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "reliability",
      "maintainability",
      "testability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Jones, A. (2022). 'Driving Data Quality with Data Contracts'. andrewjones.substack.com.",
    "secondary_sources": [
      "PayPal Engineering (2023). 'Open Data Contract Standard (ODCS)'. github.com/paypal/data-contract-template.",
      "Majchrzak, T. et al. (2023). 'Data Contracts: A Missing Link in the Data Mesh'. SIGMOD 2023 Workshop on Data Management for End-to-End Machine Learning.",
      "Jones, A. (2023). Driving Data Quality with Data Contracts. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "data-mesh",
        "type": "complement"
      },
      {
        "slug": "data-quality-framework",
        "type": "extends"
      },
      {
        "slug": "schema-registry",
        "type": "complement"
      },
      {
        "slug": "data-catalog",
        "type": "complement"
      }
    ]
  },
  {
    "id": 111,
    "name": "Threat Modeling (STRIDE)",
    "name_zh": "威胁建模（STRIDE）",
    "slug": "threat-modeling-stride",
    "category": "security",
    "desc": "Systematic identification and mitigation of security threats using the STRIDE taxonomy",
    "desc_zh": "使用 STRIDE 分类法系统识别和缓解安全威胁",
    "steps": [
      "Decompose the system into components and draw a data flow diagram (DFD) with trust boundaries",
      "Apply STRIDE categories (Spoofing, Tampering, Repudiation, Information Disclosure, Denial of Service, Elevation of Privilege) to each element",
      "Enumerate concrete threats for every DFD element crossed with each STRIDE category",
      "Rate each threat using DREAD or CVSS scoring to prioritize mitigation efforts",
      "Define mitigations (authentication, integrity checks, logging, encryption, rate limiting, authorization) and map them to threats"
    ],
    "steps_zh": [
      "将系统分解为组件，绘制带信任边界的数据流图（DFD）",
      "对每个元素应用 STRIDE 六类威胁（仿冒、篡改、抵赖、信息泄露、拒绝服务、权限提升）",
      "针对 DFD 各元素与每个 STRIDE 类别交叉枚举具体威胁",
      "使用 DREAD 或 CVSS 评分对威胁进行优先级排序",
      "定义缓解措施（认证、完整性校验、日志记录、加密、限流、授权）并映射到对应威胁"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Spoofing",
      "Tampering",
      "Repudiation",
      "Info Disclosure",
      "Privilege Escalation"
    ],
    "viz_labels_zh": [
      "欺骗",
      "篡改",
      "否认",
      "信息泄露",
      "权限提升"
    ],
    "related": [
      "defense-in-depth",
      "security-by-design",
      "owasp-top-10"
    ],
    "tags": [
      "threat-modeling",
      "stride",
      "risk-assessment",
      "data-flow",
      "microsoft"
    ],
    "origin_author": "Loren Kohnfelder & Praerit Garg (Microsoft), 1999; popularized by Adam Shostack",
    "origin_source": "Threat Modeling: Designing for Security (Wiley, 2014) by Adam Shostack; Microsoft SDL threat modeling process",
    "origin_source_zh": "「Threat Modeling: Designing for Security」（Wiley, 2014）Adam Shostack 著；微软 SDL 威胁建模流程",
    "complexity": "intermediate",
    "when_to_use": [
      "During architecture and design phases to proactively identify security weaknesses before code is written",
      "When onboarding a new system or microservice into a security review program",
      "Before major releases to validate that new features don't introduce unmitigated threats",
      "In compliance-driven environments (PCI-DSS, HIPAA) that require documented risk assessments"
    ],
    "when_to_use_zh": [
      "在架构和设计阶段主动识别安全薄弱环节，在编码前发现问题",
      "将新系统或微服务纳入安全审查体系时",
      "重大发版前验证新功能未引入未缓解的威胁",
      "在合规驱动环境（PCI-DSS、HIPAA）中需要文档化风险评估时"
    ],
    "core_concepts": [
      "STRIDE: Six-category threat taxonomy — Spoofing, Tampering, Repudiation, Information Disclosure, Denial of Service, Elevation of Privilege",
      "Data Flow Diagram (DFD): Visual decomposition of the system into processes, data stores, external entities, and data flows",
      "Trust Boundary: A logical border where the level of trust changes, marking where threats are most likely to materialize",
      "Threat Tree: Hierarchical decomposition of a high-level threat into specific attack paths and preconditions",
      "Mitigation Mapping: The practice of linking each identified threat to one or more concrete security controls"
    ],
    "core_concepts_zh": [
      "STRIDE：六类威胁分类法——仿冒、篡改、抵赖、信息泄露、拒绝服务、权限提升",
      "数据流图（DFD）：将系统可视化分解为进程、数据存储、外部实体和数据流",
      "信任边界：信任级别发生变化的逻辑边界，标识威胁最可能出现的位置",
      "威胁树：将高层威胁分层分解为具体攻击路径和前置条件",
      "缓解映射：将每个已识别威胁关联到一个或多个具体安全控制措施"
    ],
    "timeline": [
      [
        "1999",
        "Loren Kohnfelder and Praerit Garg propose the STRIDE model internally at Microsoft"
      ],
      [
        "2002",
        "Microsoft integrates STRIDE into the Security Development Lifecycle (SDL)"
      ],
      [
        "2006",
        "Microsoft releases the free Threat Modeling Tool based on STRIDE methodology"
      ],
      [
        "2014",
        "Adam Shostack publishes「Threat Modeling: Designing for Security」, establishing industry best practices"
      ],
      [
        "2020",
        "OWASP Threat Modeling Playbook and community-driven tooling expand STRIDE beyond Microsoft ecosystems"
      ]
    ],
    "timeline_zh": [
      [
        "1999",
        "Loren Kohnfelder 和 Praerit Garg 在微软内部提出 STRIDE 模型"
      ],
      [
        "2002",
        "微软将 STRIDE 集成到安全开发生命周期（SDL）中"
      ],
      [
        "2006",
        "微软发布基于 STRIDE 方法论的免费威胁建模工具"
      ],
      [
        "2014",
        "Adam Shostack 出版「Threat Modeling: Designing for Security」，确立行业最佳实践"
      ],
      [
        "2020",
        "OWASP 威胁建模手册和社区驱动工具将 STRIDE 推广到微软生态之外"
      ]
    ],
    "dos": [
      "Start threat modeling early in design — retrofitting is far more expensive than proactive analysis",
      "Involve cross-functional participants (developers, architects, QA, operations) to surface diverse threat perspectives",
      "Keep data flow diagrams up to date as the architecture evolves across sprints",
      "Use the threat model as a living document that feeds into backlog items and test plans"
    ],
    "dos_zh": [
      "尽早在设计阶段开始威胁建模——事后补救的成本远高于主动分析",
      "让跨职能成员（开发、架构、QA、运维）参与，汇集多角度威胁视角",
      "随架构迭代及时更新数据流图",
      "将威胁模型作为活文档，反馈到需求清单和测试计划中"
    ],
    "donts": [
      "Don't treat threat modeling as a one-time checkbox exercise — threats evolve as the system changes",
      "Don't attempt to enumerate every conceivable threat in one session — focus on high-risk trust boundaries first",
      "Don't skip the data flow diagram — without it STRIDE analysis becomes unstructured guesswork",
      "Don't confuse threat modeling with penetration testing — modeling is design-time, pen-testing is runtime validation"
    ],
    "donts_zh": [
      "不要把威胁建模当作一次性的合规勾选项——威胁随系统变化而演变",
      "不要试图在一次会议中穷举所有威胁——优先关注高风险信任边界",
      "不要跳过数据流图——缺少它 STRIDE 分析将变成无结构的猜测",
      "不要混淆威胁建模与渗透测试——建模在设计时进行，渗透测试是运行时验证"
    ],
    "case_study_company": "Microsoft",
    "case_study": "Microsoft mandated STRIDE-based threat modeling across all product teams as part of the Security Development Lifecycle (SDL) starting in 2004. After integrating threat modeling into the Windows Vista development cycle, the team identified and mitigated over 100 high-severity threats before release, contributing to a 45% reduction in security bulletins compared to Windows XP's equivalent post-release period.",
    "case_study_zh": "微软自 2004 年起在安全开发生命周期（SDL）中强制要求所有产品团队进行基于 STRIDE 的威胁建模。在 Windows Vista 开发周期中引入威胁建模后，团队在发布前识别并缓解了超过 100 个高危威胁，与 Windows XP 同期相比安全公告数量减少了 45%。",
    "when_not_to_use": [
      "Throwaway prototypes or hackathon projects where security analysis overhead is unjustified",
      "Purely static content sites with no user input, authentication, or sensitive data",
      "When the team lacks basic security vocabulary — invest in training first before formal threat modeling",
      "Extremely small scripts or CLI utilities with no network exposure and no privilege boundaries"
    ],
    "when_not_to_use_zh": [
      "一次性原型或黑客松项目，安全分析开销不合理时",
      "没有用户输入、认证或敏感数据的纯静态内容站点",
      "团队缺乏基本安全词汇时——应先进行培训再开展正式威胁建模",
      "没有网络暴露和权限边界的极小脚本或 CLI 工具"
    ],
    "adopters": [
      "Microsoft",
      "Google (variation: process for attack simulation and threat analysis)",
      "OWASP",
      "Intel",
      "SAFECode"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Shostack, A. (2014). \"Threat Modeling: Designing for Security\". Wiley.",
    "secondary_sources": [
      "Kohnfelder, L. & Garg, P. (1999). \"The Threats to Our Products\". Microsoft Internal Document.",
      "Howard, M. & Lipner, S. (2006). \"The Security Development Lifecycle\". Microsoft Press.",
      "OWASP (2020). \"Threat Modeling Playbook\". owasp.org."
    ],
    "typed_relations": [
      {
        "slug": "defense-in-depth",
        "type": "complement"
      },
      {
        "slug": "security-by-design",
        "type": "complement"
      },
      {
        "slug": "owasp-top-10",
        "type": "complement"
      }
    ]
  },
  {
    "id": 112,
    "name": "Zero Trust Architecture",
    "name_zh": "零信任架构",
    "slug": "zero-trust-architecture",
    "category": "security",
    "desc": "Never trust, always verify — eliminate implicit trust from network architecture",
    "desc_zh": "永不信任、持续验证——消除网络架构中的隐式信任",
    "steps": [
      "Identify all protect surfaces: critical data, assets, applications, and services (DAAS)",
      "Map transaction flows to understand how subjects access each protect surface",
      "Build a Zero Trust policy engine that evaluates identity, device health, context, and behavior for every request",
      "Enforce micro-segmentation at the network and application layer so lateral movement is blocked by default",
      "Continuously monitor and log all access decisions; feed analytics back into adaptive policies"
    ],
    "steps_zh": [
      "识别所有保护面：关键数据、资产、应用和服务（DAAS）",
      "映射事务流以理解主体如何访问各保护面",
      "构建零信任策略引擎，对每个请求评估身份、设备健康状态、上下文和行为",
      "在网络层和应用层实施微分段，默认阻断横向移动",
      "持续监控和记录所有访问决策，将分析结果反馈到自适应策略中"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Protect Surface",
      "Transaction Flow",
      "Policy Engine",
      "Micro-Segment",
      "Monitor"
    ],
    "viz_labels_zh": [
      "保护面",
      "事务流",
      "策略引擎",
      "微隔离",
      "持续监控"
    ],
    "related": [
      "principle-of-least-privilege",
      "defense-in-depth",
      "oauth2-openid-connect"
    ],
    "tags": [
      "zero-trust",
      "network-security",
      "identity",
      "micro-segmentation",
      "access-control"
    ],
    "origin_author": "John Kindervag (Forrester Research), 2010",
    "origin_source": "Forrester Research report「No More Chewy Centers: Introducing the Zero Trust Model」(2010); NIST SP 800-207「Zero Trust Architecture」(2020)",
    "origin_source_zh": "Forrester 研究报告「No More Chewy Centers: Introducing the Zero Trust Model」(2010)；NIST SP 800-207「零信任架构」(2020)",
    "complexity": "advanced",
    "when_to_use": [
      "Enterprise environments with hybrid cloud and remote workforce where perimeter-based security is insufficient",
      "Organizations handling sensitive data (financial, healthcare, government) requiring granular access control",
      "Post-breach remediation where lateral movement was the primary attack vector",
      "Mergers and acquisitions requiring rapid, secure integration of disparate network environments"
    ],
    "when_to_use_zh": [
      "混合云和远程办公的企业环境中，边界安全已不足以防护时",
      "处理敏感数据（金融、医疗、政府）需要细粒度访问控制的组织",
      "横向移动是主要攻击向量的入侵后补救场景",
      "并购场景中需要快速安全地整合不同网络环境时"
    ],
    "core_concepts": [
      "Never Trust, Always Verify: Every access request must be authenticated, authorized, and encrypted regardless of network location",
      "Micro-segmentation: Dividing the network into fine-grained zones so each workload or resource is isolated behind its own access policy",
      "Policy Decision Point (PDP): The central engine that evaluates context signals (identity, device posture, location, risk score) to allow or deny access",
      "Policy Enforcement Point (PEP): The gateway that enforces the PDP's decision at the network or application boundary",
      "Continuous Verification: Ongoing assessment of trust during a session, not just at initial authentication"
    ],
    "core_concepts_zh": [
      "永不信任、持续验证：无论网络位置如何，每个访问请求都必须经过认证、授权和加密",
      "微分段：将网络划分为细粒度区域，每个工作负载或资源都在独立的访问策略后隔离",
      "策略决策点（PDP）：评估上下文信号（身份、设备状态、位置、风险评分）以允许或拒绝访问的核心引擎",
      "策略执行点（PEP）：在网络或应用边界执行 PDP 决策的网关",
      "持续验证：在会话期间持续评估信任，而非仅在初始认证时验证"
    ],
    "timeline": [
      [
        "2010",
        "John Kindervag publishes the Zero Trust model at Forrester Research"
      ],
      [
        "2014",
        "Google publishes BeyondCorp papers, validating Zero Trust at massive scale"
      ],
      [
        "2020-08",
        "NIST releases SP 800-207「Zero Trust Architecture」as the definitive reference standard"
      ],
      [
        "2021-05",
        "US Executive Order 14028 mandates federal agencies adopt Zero Trust architecture"
      ],
      [
        "2023",
        "Major cloud providers (Azure AD Conditional Access, Google BeyondCorp Enterprise, AWS Verified Access) offer native Zero Trust products"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "John Kindervag 在 Forrester Research 发布零信任模型"
      ],
      [
        "2014",
        "Google 发布 BeyondCorp 论文，在超大规模中验证零信任"
      ],
      [
        "2020-08",
        "NIST 发布 SP 800-207「零信任架构」作为权威参考标准"
      ],
      [
        "2021-05",
        "美国第 14028 号行政令要求联邦机构采用零信任架构"
      ],
      [
        "2023",
        "主要云厂商（Azure AD 条件访问、Google BeyondCorp Enterprise、AWS Verified Access）推出原生零信任产品"
      ]
    ],
    "dos": [
      "Start with your most critical protect surfaces rather than attempting a full network transformation at once",
      "Integrate identity-aware proxies and device trust signals into every access decision",
      "Invest in comprehensive logging and SIEM integration — Zero Trust is only as strong as your visibility",
      "Treat Zero Trust as a journey with incremental maturity levels, not a single product purchase"
    ],
    "dos_zh": [
      "从最关键的保护面开始，而非试图一次性转变整个网络",
      "将身份感知代理和设备信任信号集成到每个访问决策中",
      "投资全面的日志记录和 SIEM 集成——零信任的强度取决于可见性",
      "将零信任视为具有渐进成熟度级别的旅程，而非一次性产品采购"
    ],
    "donts": [
      "Don't assume VPN equals Zero Trust — VPN still grants broad network access once connected",
      "Don't neglect user experience — overly aggressive verification causes friction and workaround behaviors",
      "Don't forget east-west traffic — Zero Trust must apply inside the network, not just at the perimeter",
      "Don't implement Zero Trust without executive sponsorship — it requires cross-team coordination and budget"
    ],
    "donts_zh": [
      "不要假设 VPN 等于零信任——VPN 连接后仍授予广泛网络访问权限",
      "不要忽视用户体验——过度激进的验证会造成摩擦和绕行行为",
      "不要忽略东西向流量——零信任必须应用于网络内部，而非仅限边界",
      "不要在没有高层支持的情况下实施零信任——它需要跨团队协调和预算"
    ],
    "case_study_company": "Google",
    "case_study": "Google pioneered Zero Trust at enterprise scale with BeyondCorp, launched after the 2009 Operation Aurora attacks. By eliminating the privileged corporate network and requiring every request to be authenticated and authorized based on user identity and device state, Google enabled 100,000+ employees to work securely from any network without VPN. BeyondCorp became the template for the entire industry's Zero Trust adoption.",
    "case_study_zh": "Google 在 2009 年「极光行动」攻击后率先以 BeyondCorp 在企业级大规模实施零信任。通过消除特权企业网络，要求每个请求基于用户身份和设备状态进行认证与授权，Google 使 10 万余名员工无需 VPN 即可从任何网络安全工作。BeyondCorp 成为整个行业零信任采用的模板。",
    "when_not_to_use": [
      "Air-gapped networks with no external connectivity where perimeter isolation is physically enforced",
      "Very small teams (under 10) with a single flat network and no sensitive data — the overhead may exceed the risk",
      "Legacy OT/ICS environments where devices cannot support modern authentication protocols",
      "Short-lived experimental environments that will be destroyed before any real data flows through them"
    ],
    "when_not_to_use_zh": [
      "无外部连接的气隙网络，物理隔离已强制执行",
      "极小团队（10 人以下）单一扁平网络且无敏感数据时——开销可能超过风险",
      "无法支持现代认证协议的传统 OT/ICS 环境",
      "在真实数据流入之前就会销毁的短期实验环境"
    ],
    "adopters": [
      "Google (BeyondCorp)",
      "Microsoft (Azure AD Conditional Access)",
      "Cloudflare (Cloudflare Access)",
      "Zscaler",
      "US Federal Government (per EO 14028)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Rose, S. et al. (2020). \"Zero Trust Architecture\". NIST Special Publication 800-207.",
    "secondary_sources": [
      "Kindervag, J. (2010). \"No More Chewy Centers: Introducing the Zero Trust Model of Information Security\". Forrester Research.",
      "Ward, R. & Beyer, B. (2014). \"BeyondCorp: A New Approach to Enterprise Security\". ;login: USENIX, 39(6)."
    ],
    "typed_relations": [
      {
        "slug": "principle-of-least-privilege",
        "type": "prerequisite"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      },
      {
        "slug": "oauth2-openid-connect",
        "type": "complement"
      }
    ]
  },
  {
    "id": 113,
    "name": "OAuth 2.0 / OpenID Connect",
    "name_zh": "OAuth 2.0 / OpenID Connect 授权与身份框架",
    "slug": "oauth2-openid-connect",
    "category": "security",
    "desc": "Delegated authorization and federated identity for secure API and application access",
    "desc_zh": "用于安全 API 和应用访问的委托授权与联合身份框架",
    "steps": [
      "Register the client application with the authorization server and obtain client credentials",
      "Redirect the resource owner to the authorization endpoint with the appropriate grant type and scopes",
      "The authorization server authenticates the user and issues an authorization code (or token directly for implicit flows)",
      "The client exchanges the authorization code for an access token (and optionally a refresh token and ID token via OpenID Connect)",
      "The client presents the access token to the resource server; the server validates the token and enforces scope-based access control"
    ],
    "steps_zh": [
      "在授权服务器注册客户端应用并获取客户端凭据",
      "使用适当的授权类型和作用域将资源所有者重定向到授权端点",
      "授权服务器认证用户并颁发授权码（隐式流直接颁发令牌）",
      "客户端用授权码换取访问令牌（可选地通过 OpenID Connect 获取刷新令牌和 ID 令牌）",
      "客户端向资源服务器出示访问令牌；服务器验证令牌并执行基于作用域的访问控制"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Client",
      "Auth Server",
      "Auth Code",
      "Access Token",
      "Resource Server"
    ],
    "viz_labels_zh": [
      "客户端",
      "授权服务器",
      "授权码",
      "访问令牌",
      "资源服务器"
    ],
    "related": [
      "zero-trust-architecture",
      "principle-of-least-privilege",
      "security-by-design"
    ],
    "tags": [
      "oauth",
      "openid-connect",
      "authorization",
      "identity",
      "tokens",
      "api-security"
    ],
    "origin_author": "Eran Hammer, Dick Hardt et al. (IETF); OpenID Foundation",
    "origin_source": "RFC 6749「The OAuth 2.0 Authorization Framework」(IETF, 2012); RFC 6750 Bearer Token Usage; OpenID Connect Core 1.0 (2014)",
    "origin_source_zh": "RFC 6749「OAuth 2.0 授权框架」(IETF, 2012)；RFC 6750 Bearer 令牌用法；OpenID Connect Core 1.0 (2014)",
    "complexity": "intermediate",
    "when_to_use": [
      "Building APIs that need delegated third-party access without sharing user credentials",
      "Implementing single sign-on (SSO) across multiple applications or microservices",
      "Mobile and SPA applications that require secure token-based authentication",
      "Integrating with external identity providers (Google, Microsoft, Apple) for federated login"
    ],
    "when_to_use_zh": [
      "构建需要委托第三方访问而不共享用户凭据的 API",
      "在多个应用或微服务间实现单点登录（SSO）",
      "需要安全的基于令牌认证的移动和单页应用",
      "集成外部身份提供商（Google、Microsoft、Apple）实现联合登录"
    ],
    "core_concepts": [
      "Authorization Grant: A credential representing the resource owner's consent, exchanged for an access token (e.g., authorization code, client credentials)",
      "Access Token: A short-lived credential that the client presents to access protected resources on behalf of the user",
      "Refresh Token: A long-lived credential used to obtain new access tokens without requiring the user to re-authenticate",
      "ID Token (OIDC): A JWT containing identity claims about the authenticated user, issued alongside the access token in OpenID Connect flows",
      "Scopes: Named permissions that limit what an access token can do, enabling the principle of least privilege at the API level"
    ],
    "core_concepts_zh": [
      "授权许可：代表资源所有者同意的凭据，用于换取访问令牌（如授权码、客户端凭据）",
      "访问令牌：客户端出示以代表用户访问受保护资源的短期凭据",
      "刷新令牌：用于获取新访问令牌而无需用户重新认证的长期凭据",
      "ID 令牌（OIDC）：包含已认证用户身份声明的 JWT，在 OpenID Connect 流程中与访问令牌一起颁发",
      "作用域：限制访问令牌操作范围的命名权限，在 API 层面实现最小权限原则"
    ],
    "timeline": [
      [
        "2007",
        "OAuth 1.0 published as an informal community specification for delegated authorization"
      ],
      [
        "2012-10",
        "OAuth 2.0 (RFC 6749) published by IETF, simplifying the protocol and supporting multiple grant types"
      ],
      [
        "2014-02",
        "OpenID Connect Core 1.0 ratified, adding an identity layer on top of OAuth 2.0"
      ],
      [
        "2020",
        "OAuth 2.1 draft consolidates best practices, deprecating implicit grant and requiring PKCE for all clients using the authorization code flow"
      ],
      [
        "2023",
        "Rich Authorization Requests (RAR, RFC 9396) extends OAuth with fine-grained, structured authorization"
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "OAuth 1.0 作为委托授权的非正式社区规范发布"
      ],
      [
        "2012-10",
        "OAuth 2.0（RFC 6749）由 IETF 发布，简化协议并支持多种授权类型"
      ],
      [
        "2014-02",
        "OpenID Connect Core 1.0 批准，在 OAuth 2.0 之上添加身份层"
      ],
      [
        "2020",
        "OAuth 2.1 草案整合最佳实践，弃用隐式授权并要求所有使用授权码流程的客户端采用 PKCE"
      ],
      [
        "2023",
        "富授权请求（RAR, RFC 9396）扩展 OAuth，支持细粒度结构化授权"
      ]
    ],
    "dos": [
      "Always use PKCE (Proof Key for Code Exchange) for public clients — it prevents authorization code interception attacks",
      "Validate tokens server-side using the authorization server's JWKS endpoint or introspection endpoint",
      "Keep access token lifetimes short (minutes) and use refresh tokens for long-lived sessions",
      "Restrict scopes to the minimum required — over-scoped tokens magnify the blast radius of a token leak"
    ],
    "dos_zh": [
      "公共客户端务必使用 PKCE（授权码交换证明密钥）——防止授权码拦截攻击",
      "使用授权服务器的 JWKS 端点或内省端点在服务端验证令牌",
      "保持访问令牌生命周期短（分钟级）并使用刷新令牌维持长期会话",
      "将作用域限制到最小必需——过宽的令牌作用域会放大令牌泄露的爆炸半径"
    ],
    "donts": [
      "Don't use the implicit grant flow — it exposes tokens in the URL fragment and is deprecated in OAuth 2.1",
      "Don't store tokens in localStorage — use httpOnly secure cookies or in-memory storage to limit XSS exposure",
      "Don't treat the access token as proof of identity — use the ID token from OpenID Connect for authentication",
      "Don't skip the state parameter — without it the flow is vulnerable to CSRF attacks"
    ],
    "donts_zh": [
      "不要使用隐式授权流——它在 URL 片段中暴露令牌，已在 OAuth 2.1 中弃用",
      "不要将令牌存储在 localStorage——使用 httpOnly 安全 Cookie 或内存存储以限制 XSS 风险",
      "不要将访问令牌当作身份证明——使用 OpenID Connect 的 ID 令牌进行认证",
      "不要跳过 state 参数——缺少它流程将易受 CSRF 攻击"
    ],
    "case_study_company": "Okta",
    "case_study": "Okta built its entire identity platform on OAuth 2.0 and OpenID Connect, serving over 18,000 enterprise customers by 2023. By providing a standards-based authorization server with pre-built integrations for thousands of applications, Okta reduced enterprise SSO deployment time from months to days. Their acquisition of Auth0 in 2021 further cemented OAuth/OIDC as the universal developer-facing identity protocol.",
    "case_study_zh": "Okta 在 OAuth 2.0 和 OpenID Connect 之上构建了整个身份平台，到 2023 年服务超过 18,000 家企业客户。通过提供基于标准的授权服务器和数千个应用的预构建集成，Okta 将企业 SSO 部署时间从数月缩短到数天。2021 年收购 Auth0 进一步巩固了 OAuth/OIDC 作为面向开发者的通用身份协议地位。",
    "when_not_to_use": [
      "Internal machine-to-machine communication where mutual TLS (mTLS) is simpler and more appropriate",
      "Simple single-application systems with no third-party integrations where session-based auth suffices",
      "Extremely constrained IoT devices that cannot perform HTTP redirects or handle token management",
      "When you need transaction-level authorization — OAuth scopes are coarse; consider UMA or RAR for fine-grained control"
    ],
    "when_not_to_use_zh": [
      "双向 TLS（mTLS）更简单适用的内部机器间通信",
      "无第三方集成的简单单应用系统，会话认证已足够",
      "无法执行 HTTP 重定向或处理令牌管理的极度受限 IoT 设备",
      "需要事务级授权时——OAuth 作用域粒度较粗，应考虑 UMA 或 RAR 进行细粒度控制"
    ],
    "adopters": [
      "Google",
      "Microsoft (Azure AD / Entra ID)",
      "Okta / Auth0",
      "GitHub",
      "Salesforce"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "security",
      "usability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Hardt, D. (2012). \"The OAuth 2.0 Authorization Framework\". IETF RFC 6749.",
    "secondary_sources": [
      "Sakimura, N. et al. (2014). \"OpenID Connect Core 1.0\". OpenID Foundation.",
      "Jones, M. & Hardt, D. (2012). \"The OAuth 2.0 Authorization Framework: Bearer Token Usage\". IETF RFC 6750.",
      "Richer, J. & Sanso, A. (2017). \"OAuth 2 in Action\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "zero-trust-architecture",
        "type": "complement"
      },
      {
        "slug": "principle-of-least-privilege",
        "type": "prerequisite"
      },
      {
        "slug": "security-by-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 114,
    "name": "Defense in Depth",
    "name_zh": "纵深防御",
    "slug": "defense-in-depth",
    "category": "security",
    "desc": "Layer multiple independent security controls so that no single point of failure compromises the system",
    "desc_zh": "分层部署多个独立安全控制，确保任何单点故障不会导致整个系统沦陷",
    "steps": [
      "Identify all system layers: physical, network, host, application, data, and human",
      "For each layer, define independent security controls (firewalls, WAFs, encryption, access controls, training)",
      "Ensure controls at adjacent layers compensate for each other's weaknesses — no two layers rely on the same mechanism",
      "Implement monitoring and alerting at each layer to detect failures or bypasses in real time",
      "Regularly test the layered defenses via red-team exercises and verify that breaching one layer triggers compensating controls"
    ],
    "steps_zh": [
      "识别所有系统层：物理层、网络层、主机层、应用层、数据层和人员层",
      "为每一层定义独立的安全控制（防火墙、WAF、加密、访问控制、培训）",
      "确保相邻层的控制措施互相补偿对方的弱点——任意两层不依赖同一机制",
      "在每一层实施监控和告警，实时检测故障或绕过",
      "通过红队演练定期测试分层防御，验证突破一层后是否触发补偿控制"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Physical",
      "Network",
      "Host",
      "Application",
      "Data"
    ],
    "viz_labels_zh": [
      "物理层",
      "网络层",
      "主机层",
      "应用层",
      "数据层"
    ],
    "related": [
      "threat-modeling-stride",
      "zero-trust-architecture",
      "security-by-design"
    ],
    "tags": [
      "defense-in-depth",
      "layered-security",
      "controls",
      "risk-management",
      "nist"
    ],
    "origin_author": "Military strategy concept; adapted to information security by NSA and NIST",
    "origin_source": "NSA Information Assurance Technical Framework (IATF); NIST SP 800-53「Security and Privacy Controls for Information Systems and Organizations」",
    "origin_source_zh": "NSA 信息保障技术框架（IATF）；NIST SP 800-53「信息系统和组织的安全与隐私控制」",
    "complexity": "beginner",
    "when_to_use": [
      "Designing the overall security posture for any non-trivial production system",
      "When compliance frameworks (SOC 2, ISO 27001, PCI-DSS) require demonstrable layered controls",
      "After a security incident reveals that a single control was the sole line of defense",
      "Cloud migration projects where the shared responsibility model demands explicit controls at each layer"
    ],
    "when_to_use_zh": [
      "为任何非平凡生产系统设计整体安全态势时",
      "合规框架（SOC 2、ISO 27001、PCI-DSS）要求可证明的分层控制时",
      "安全事件揭示某单一控制措施是唯一防线之后",
      "共享责任模型要求每层有明确控制的云迁移项目"
    ],
    "core_concepts": [
      "Layered Controls: Independent security mechanisms applied at physical, network, host, application, and data layers",
      "Compensating Controls: Backup security measures that activate when a primary control fails or is bypassed",
      "Fail-Secure: Designing systems to deny access by default when a security control encounters an error",
      "Diversity of Defense: Using different technologies and vendors at each layer to prevent a single vulnerability from cascading",
      "Security Monitoring: Continuous observation at every layer to detect anomalies and trigger incident response"
    ],
    "core_concepts_zh": [
      "分层控制：在物理、网络、主机、应用和数据各层应用独立安全机制",
      "补偿控制：当主控制失效或被绕过时启动的备份安全措施",
      "故障安全（Fail-Secure）：设计系统在安全控制遇到错误时默认拒绝访问",
      "防御多样性：在各层使用不同技术和供应商，防止单一漏洞级联",
      "安全监控：在每一层持续观察，检测异常并触发事件响应"
    ],
    "timeline": [
      [
        "1990s",
        "NSA Information Assurance Technical Framework (IATF) formalizes Defense in Depth for IT systems"
      ],
      [
        "2005",
        "NIST SP 800-53 (initial release) codifies layered security controls as a federal standard"
      ],
      [
        "2013",
        "Target breach (HVAC vendor to POS) becomes a textbook case of insufficient defense in depth"
      ],
      [
        "2018",
        "NIST Cybersecurity Framework v1.1 reinforces layered controls across Identify, Protect, Detect, Respond, Recover"
      ],
      [
        "2023",
        "Cloud-native defense in depth evolves with service mesh policies, eBPF-based monitoring, and runtime security"
      ]
    ],
    "timeline_zh": [
      [
        "1990s",
        "NSA 信息保障技术框架（IATF）将纵深防御正式化用于 IT 系统"
      ],
      [
        "2005",
        "NIST SP 800-53（初版）将分层安全控制编入联邦标准"
      ],
      [
        "2013",
        "Target 数据泄露（从 HVAC 供应商到 POS）成为纵深防御不足的教科书案例"
      ],
      [
        "2017",
        "NIST 网络安全框架 v1.1 在识别、保护、检测、响应、恢复各功能中加强分层控制"
      ],
      [
        "2023",
        "云原生纵深防御随服务网格策略、基于 eBPF 的监控和运行时安全而演进"
      ]
    ],
    "dos": [
      "Ensure each layer provides independent value — if removing one layer doesn't change your risk posture, it's redundant, not defense in depth",
      "Document the purpose and coverage of each control layer for incident responders and auditors",
      "Include human-layer defenses (security awareness training, phishing simulations) alongside technical controls",
      "Test layers in isolation and in combination to validate that failures cascade into detection, not into compromise"
    ],
    "dos_zh": [
      "确保每层提供独立价值——如果移除某层不改变风险态势，它是冗余而非纵深防御",
      "为事件响应人员和审计员文档化每个控制层的目的和覆盖范围",
      "在技术控制之外包含人员层防御（安全意识培训、钓鱼模拟）",
      "单独和组合测试各层，验证故障级联进入检测环节而非导致入侵"
    ],
    "donts": [
      "Don't equate more layers with better security — poorly configured layers create complexity without protection",
      "Don't rely on a single vendor for all layers — a vulnerability in their stack compromises every layer simultaneously",
      "Don't neglect the data layer — encryption at rest and in transit is often the last line of defense after all other controls fail",
      "Don't assume network controls alone constitute defense in depth — application-layer and identity controls are equally critical"
    ],
    "donts_zh": [
      "不要将更多层等同于更好的安全——配置不当的层只增加复杂度而不提供保护",
      "不要依赖单一供应商覆盖所有层——其技术栈中的漏洞将同时危及每一层",
      "不要忽视数据层——静态和传输加密通常是其他控制全部失效后的最后防线",
      "不要假设网络控制单独构成纵深防御——应用层和身份控制同样关键"
    ],
    "case_study_company": "Target",
    "case_study": "The 2013 Target data breach exposed 40 million credit card records when attackers compromised an HVAC vendor's credentials and moved laterally to the point-of-sale network. The breach demonstrated catastrophic failure of defense in depth: a flat, poorly segmented network, insufficient monitoring between zones, and no data-layer encryption on cardholder data. Post-breach, Target invested over $200 million in layered security controls including network segmentation, endpoint detection, and tokenization.",
    "case_study_zh": "2013 年 Target 数据泄露事件中，攻击者入侵了 HVAC 供应商凭据后横向移动到 POS 网络，暴露了 4000 万张信用卡记录。该事件展示了纵深防御的灾难性失败：网络扁平且缺乏分段、区域间监控不足、持卡人数据缺乏数据层加密。事后 Target 投入超过 2 亿美元建设分层安全控制，包括网络分段、端点检测和令牌化。",
    "when_not_to_use": [
      "Minimal-risk hobby projects where any security control is optional",
      "When resource constraints force a choice between one strong control and many weak controls — choose one strong control",
      "Extremely simple systems (static pages, read-only public data) where a single layer of access control suffices",
      "When adding layers introduces latency or complexity that exceeds the risk being mitigated"
    ],
    "when_not_to_use_zh": [
      "安全控制为可选项的极低风险个人项目",
      "资源约束迫使在一个强控制和多个弱控制之间选择时——选择一个强控制",
      "单层访问控制即可满足的极简系统（静态页面、只读公开数据）",
      "增加层级引入的延迟或复杂度超过所缓解风险时"
    ],
    "adopters": [
      "US Department of Defense",
      "NIST",
      "AWS (shared responsibility model)",
      "JPMorgan Chase",
      "Cloudflare"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "NSA (2002). \"Information Assurance Technical Framework (IATF), Release 3.1\". National Security Agency.",
    "secondary_sources": [
      "NIST (2020). \"Security and Privacy Controls for Information Systems and Organizations\". NIST Special Publication 800-53, Rev. 5.",
      "Schneier, B. (2000). \"Secrets and Lies: Digital Security in a Networked World\". Wiley."
    ],
    "typed_relations": [
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      },
      {
        "slug": "zero-trust-architecture",
        "type": "complement"
      },
      {
        "slug": "security-by-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 115,
    "name": "Privacy by Design",
    "name_zh": "隐私设计",
    "slug": "privacy-by-design",
    "category": "security",
    "desc": "Embed privacy protections into the design and architecture of systems from the outset, not as an afterthought",
    "desc_zh": "从一开始就将隐私保护嵌入系统的设计和架构中，而非事后补救",
    "steps": [
      "Conduct a Privacy Impact Assessment (PIA) at the design phase to identify all personal data flows and processing purposes",
      "Apply data minimization: collect only the data strictly necessary for each stated purpose",
      "Implement privacy-enhancing technologies (PETs): pseudonymization, encryption, differential privacy, or federated processing",
      "Design user-facing consent and control mechanisms (granular opt-in, data portability, deletion requests)",
      "Establish ongoing privacy governance: periodic audits, retention enforcement, breach notification procedures, and DPO oversight"
    ],
    "steps_zh": [
      "在设计阶段进行隐私影响评估（PIA），识别所有个人数据流和处理目的",
      "应用数据最小化原则：仅收集每个声明目的所严格必需的数据",
      "实施隐私增强技术（PET）：假名化、加密、差分隐私或联邦处理",
      "设计面向用户的同意和控制机制（细粒度选择加入、数据可携带、删除请求）",
      "建立持续隐私治理：定期审计、保留期执行、泄露通知流程和 DPO 监督"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "PIA",
      "Data Minimization",
      "PETs",
      "User Consent",
      "Privacy Governance"
    ],
    "viz_labels_zh": [
      "隐私评估",
      "数据最小化",
      "隐私技术",
      "用户同意",
      "隐私治理"
    ],
    "related": [
      "security-by-design",
      "principle-of-least-privilege",
      "defense-in-depth"
    ],
    "tags": [
      "privacy",
      "gdpr",
      "data-protection",
      "pbd",
      "data-minimization",
      "consent"
    ],
    "origin_author": "Ann Cavoukian (Information and Privacy Commissioner of Ontario), 1990s; formalized 2009",
    "origin_source": "Ann Cavoukian「Privacy by Design: The 7 Foundational Principles」(2009); GDPR Article 25「Data protection by design and by default」(2016/2018)",
    "origin_source_zh": "Ann Cavoukian「隐私设计：七项基础原则」(2009)；GDPR 第 25 条「设计和默认的数据保护」(2016/2018)",
    "complexity": "intermediate",
    "when_to_use": [
      "Any system that collects, processes, or stores personally identifiable information (PII)",
      "Products subject to GDPR, CCPA, LGPD, or other data protection regulations",
      "AI/ML pipelines that train on user data and risk unintentional memorization or re-identification",
      "Healthcare, fintech, or edtech applications where data sensitivity is inherently high"
    ],
    "when_to_use_zh": [
      "任何收集、处理或存储个人身份信息（PII）的系统",
      "受 GDPR、CCPA、LGPD 或其他数据保护法规管辖的产品",
      "在用户数据上训练且存在无意记忆或重新识别风险的 AI/ML 管道",
      "数据敏感度固有较高的医疗、金融科技或教育科技应用"
    ],
    "core_concepts": [
      "Data Minimization: Collect and retain only the personal data strictly necessary for the stated processing purpose",
      "Purpose Limitation: Personal data must be collected for specified, explicit, and legitimate purposes and not further processed in a manner incompatible with those purposes",
      "Privacy-Enhancing Technologies (PETs): Technical measures — pseudonymization, differential privacy, homomorphic encryption — that reduce privacy risk while preserving utility",
      "Consent and Control: Empowering data subjects with transparent, granular, and revocable choices over their personal data",
      "Privacy by Default: The strictest privacy settings apply automatically without requiring user action"
    ],
    "core_concepts_zh": [
      "数据最小化：仅收集和保留声明处理目的所严格必需的个人数据",
      "目的限制：个人数据必须为指定、明确和合法的目的收集，不得以与这些目的不相容的方式进一步处理",
      "隐私增强技术（PET）：假名化、差分隐私、同态加密等技术措施，在保持效用的同时降低隐私风险",
      "同意与控制：赋予数据主体对其个人数据的透明、细粒度和可撤销的选择权",
      "默认隐私：最严格的隐私设置自动生效，无需用户主动操作"
    ],
    "timeline": [
      [
        "1995",
        "EU Data Protection Directive 95/46/EC introduces data protection by design concepts"
      ],
      [
        "2009",
        "Ann Cavoukian publishes the 7 Foundational Principles of Privacy by Design"
      ],
      [
        "2010",
        "International Conference of Data Protection and Privacy Commissioners unanimously passes a resolution recognizing PbD as an essential component of fundamental privacy protection"
      ],
      [
        "2018-05",
        "GDPR takes effect with Article 25 making data protection by design and by default a legal requirement"
      ],
      [
        "2023",
        "Privacy-enhancing computation (federated learning, secure multi-party computation) goes mainstream in enterprise AI"
      ]
    ],
    "timeline_zh": [
      [
        "1995",
        "欧盟数据保护指令 95/46/EC 引入设计即数据保护的概念"
      ],
      [
        "2009",
        "Ann Cavoukian 发布隐私设计的七项基础原则"
      ],
      [
        "2010",
        "国际数据保护与隐私专员大会一致通过决议，认可 PbD 是基本隐私保护的重要组成部分"
      ],
      [
        "2018-05",
        "GDPR 生效，第 25 条将设计和默认的数据保护作为法律要求"
      ],
      [
        "2023",
        "隐私增强计算（联邦学习、安全多方计算）在企业 AI 中成为主流"
      ]
    ],
    "dos": [
      "Conduct privacy impact assessments before collecting any new category of personal data",
      "Implement data retention policies with automated deletion — privacy is not just about collection but also about disposal",
      "Offer users genuine controls: export, delete, opt-out — not just a privacy policy page",
      "Use privacy-preserving analytics (differential privacy, k-anonymity) instead of collecting raw granular data"
    ],
    "dos_zh": [
      "在收集任何新类别的个人数据前进行隐私影响评估",
      "实施带自动删除的数据保留策略——隐私不仅关乎收集，也关乎处置",
      "向用户提供真正的控制权：导出、删除、退出——而非仅有隐私政策页面",
      "使用隐私保护分析（差分隐私、k-匿名）替代收集原始细粒度数据"
    ],
    "donts": [
      "Don't treat consent as a blanket checkbox — GDPR requires specific, informed, and granular consent per purpose",
      "Don't collect data speculatively 'in case it's useful later' — purpose limitation forbids this",
      "Don't confuse anonymization with pseudonymization — pseudonymized data is still personal data under GDPR",
      "Don't defer privacy to a post-launch patch — regulatory fines and reputational damage are far more expensive than upfront design"
    ],
    "donts_zh": [
      "不要将同意视为笼统的勾选框——GDPR 要求按目的进行具体、知情和细粒度的同意",
      "不要以「可能以后有用」为由投机性收集数据——目的限制原则禁止此行为",
      "不要混淆匿名化与假名化——假名化数据在 GDPR 下仍属个人数据",
      "不要将隐私推迟到上线后修补——监管罚款和声誉损失远比前期设计昂贵"
    ],
    "case_study_company": "Apple",
    "case_study": "Apple has embedded Privacy by Design as a core product principle, implementing on-device processing for Siri, Photos facial recognition, and Health data, along with App Tracking Transparency (ATT) requiring explicit opt-in for cross-app tracking. The introduction of ATT in iOS 14.5 (2021) caused an estimated $10 billion revenue impact on Meta's advertising business, demonstrating that privacy-by-design decisions at the platform level reshape entire industry economics.",
    "case_study_zh": "Apple 将隐私设计作为核心产品原则，为 Siri、照片人脸识别和健康数据实施设备端处理，并推出应用跟踪透明度（ATT）要求跨应用跟踪前获取明确同意。iOS 14.5（2021）引入 ATT 对 Meta 广告业务造成约 100 亿美元收入影响，证明平台级隐私设计决策可以重塑整个行业经济格局。",
    "when_not_to_use": [
      "Systems that process zero personal data and have no user interaction (pure infrastructure tooling)",
      "Open public datasets where all information is already lawfully public and no re-identification risk exists",
      "Internal-only developer tools with no PII in any data path",
      "When the 'system' is a purely mathematical algorithm with no data ingestion"
    ],
    "when_not_to_use_zh": [
      "处理零个人数据且无用户交互的系统（纯基础设施工具）",
      "所有信息已合法公开且无重新识别风险的开放公共数据集",
      "任何数据路径中无 PII 的内部专用开发者工具",
      "「系统」是无数据摄入的纯数学算法时"
    ],
    "adopters": [
      "Apple",
      "Microsoft",
      "Google (Privacy Sandbox)",
      "European Union (GDPR mandate)",
      "Brave Browser"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Cavoukian, A. (2009). \"Privacy by Design: The 7 Foundational Principles\". Information and Privacy Commissioner of Ontario.",
    "secondary_sources": [
      "European Parliament (2016). \"General Data Protection Regulation (GDPR), Article 25: Data Protection by Design and by Default\". Official Journal of the European Union.",
      "Hoepman, J.-H. (2014). \"Privacy Design Strategies\". Proceedings of the IFIP TC 11 International Conference (SEC 2014). Springer."
    ],
    "typed_relations": [
      {
        "slug": "security-by-design",
        "type": "complement"
      },
      {
        "slug": "principle-of-least-privilege",
        "type": "complement"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      }
    ]
  },
  {
    "id": 116,
    "name": "OWASP Top 10",
    "name_zh": "OWASP 十大 Web 应用安全风险",
    "slug": "owasp-top-10",
    "category": "security",
    "desc": "A prioritized framework of the most critical web application security risks with proven mitigation strategies",
    "desc_zh": "最关键 Web 应用安全风险的优先级框架及经过验证的缓解策略",
    "steps": [
      "Familiarize the team with the current OWASP Top 10 categories (Broken Access Control, Cryptographic Failures, Injection, etc.)",
      "Integrate OWASP checks into the secure SDLC: code reviews, SAST, DAST, and dependency scanning against each category",
      "Implement category-specific mitigations: parameterized queries for injection, strong access control for authorization, proper TLS configuration for cryptographic failures",
      "Add OWASP-aligned test cases to the CI/CD pipeline to catch regressions automatically",
      "Review and update alignment annually as the OWASP Top 10 evolves with new data from the community"
    ],
    "steps_zh": [
      "让团队熟悉当前 OWASP Top 10 各类别（访问控制失效、加密故障、注入等）",
      "将 OWASP 检查集成到安全 SDLC 中：代码审查、SAST、DAST 和依赖扫描逐一对照各类别",
      "实施针对各类别的缓解措施：参数化查询防注入、强访问控制防授权问题、正确 TLS 配置防加密故障",
      "在 CI/CD 管道中添加 OWASP 对齐的测试用例以自动捕获回归",
      "随 OWASP Top 10 根据社区新数据演进，每年审查并更新对齐情况"
    ],
    "ai_relevant": false,
    "viz_type": "radar",
    "viz_labels": [
      "Broken Access",
      "Injection",
      "Cryptographic",
      "Misconfiguration",
      "Outdated Comp"
    ],
    "viz_labels_zh": [
      "访问控制",
      "注入攻击",
      "加密缺陷",
      "安全配置",
      "过期组件"
    ],
    "related": [
      "threat-modeling-stride",
      "security-by-design",
      "defense-in-depth"
    ],
    "tags": [
      "owasp",
      "web-security",
      "application-security",
      "vulnerabilities",
      "appsec"
    ],
    "origin_author": "OWASP Foundation (community-driven, led by Andrew van der Stock et al.)",
    "origin_source": "OWASP Top 10 — 2021 edition (owasp.org/Top10); first published 2003",
    "origin_source_zh": "OWASP Top 10 — 2021 版（owasp.org/Top10）；首次发布于 2003 年",
    "complexity": "beginner",
    "when_to_use": [
      "As a baseline security checklist for any web application or API project",
      "When training developers on application security fundamentals",
      "During security audits and penetration test scoping to ensure coverage of the most common risks",
      "As a vendor assessment criterion to verify third-party application security posture"
    ],
    "when_to_use_zh": [
      "作为任何 Web 应用或 API 项目的基线安全检查清单",
      "培训开发者应用安全基础知识时",
      "安全审计和渗透测试范围界定中确保覆盖最常见风险",
      "作为供应商评估标准验证第三方应用安全态势"
    ],
    "core_concepts": [
      "Broken Access Control: Failures that allow users to act outside their intended permissions — the #1 risk in the 2021 edition",
      "Injection: Untrusted data sent to an interpreter as part of a command or query, including SQL, NoSQL, OS, and LDAP injection",
      "Cryptographic Failures: Weaknesses in encryption, hashing, or key management that expose sensitive data",
      "Security Misconfiguration: Insecure default settings, open cloud storage, missing security headers, or verbose error messages",
      "Software and Data Integrity Failures: Code and infrastructure that don't verify integrity — including insecure CI/CD pipelines and unsigned updates"
    ],
    "core_concepts_zh": [
      "访问控制失效：允许用户在预期权限之外操作的故障——2021 版排名第一的风险",
      "注入：将不可信数据作为命令或查询的一部分发送给解释器，包括 SQL、NoSQL、OS 和 LDAP 注入",
      "加密故障：加密、哈希或密钥管理中的弱点导致敏感数据暴露",
      "安全配置错误：不安全的默认设置、开放的云存储、缺失的安全头或详细的错误消息",
      "软件和数据完整性故障：不验证完整性的代码和基础设施——包括不安全的 CI/CD 管道和未签名的更新"
    ],
    "timeline": [
      [
        "2003",
        "First OWASP Top 10 published, establishing injection and XSS as dominant web risks"
      ],
      [
        "2010",
        "OWASP Top 10 — 2010 edition ranks entries by risk rather than prevalence, adding Security Misconfiguration and Unvalidated Redirects and Forwards"
      ],
      [
        "2017",
        "OWASP Top 10 — 2017 edition introduces XML External Entities (XXE) and insecure deserialization"
      ],
      [
        "2021",
        "OWASP Top 10 — 2021 edition restructures categories: Broken Access Control rises to #1; new categories include SSRF and Software Integrity Failures"
      ],
      [
        "2023",
        "OWASP releases the Top 10 for LLM Applications, extending the framework to AI security risks"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "首版 OWASP Top 10 发布，将注入和 XSS 确立为主要 Web 风险"
      ],
      [
        "2010",
        "OWASP Top 10 — 2010 版改为按风险而非流行度排序，新增安全配置错误和未验证的重定向与转发"
      ],
      [
        "2017",
        "OWASP Top 10 — 2017 版引入 XML 外部实体（XXE）和不安全反序列化"
      ],
      [
        "2021",
        "OWASP Top 10 — 2021 版重构类别：访问控制失效升至第一；新增 SSRF 和软件完整性故障"
      ],
      [
        "2023",
        "OWASP 发布 LLM 应用 Top 10，将框架扩展到 AI 安全风险"
      ]
    ],
    "dos": [
      "Automate OWASP Top 10 checks with SAST/DAST tools integrated into CI/CD pipelines",
      "Use the OWASP Application Security Verification Standard (ASVS) as the detailed testing counterpart to the Top 10",
      "Train every developer on at least the Top 10 categories — security is a whole-team responsibility",
      "Map each Top 10 category to specific coding standards and pull-request review checklists"
    ],
    "dos_zh": [
      "使用集成到 CI/CD 管道的 SAST/DAST 工具自动化 OWASP Top 10 检查",
      "使用 OWASP ASVS 作为 Top 10 的详细测试对应标准",
      "培训每位开发者至少了解 Top 10 各类别——安全是全团队的责任",
      "将每个 Top 10 类别映射到具体的编码标准和 PR 审查清单"
    ],
    "donts": [
      "Don't treat the OWASP Top 10 as an exhaustive security program — it covers only the most common risks, not all risks",
      "Don't rely solely on automated scanners — many OWASP categories (broken access control, business logic flaws) require manual review",
      "Don't ignore the 'moved off the list' categories — removal from the Top 10 doesn't mean the vulnerability no longer exists",
      "Don't use the Top 10 as a penetration test methodology — it's a risk awareness framework, not a testing guide"
    ],
    "donts_zh": [
      "不要将 OWASP Top 10 视为完整的安全计划——它仅覆盖最常见的风险，而非所有风险",
      "不要仅依赖自动扫描器——许多 OWASP 类别（访问控制失效、业务逻辑缺陷）需要人工审查",
      "不要忽略「移出列表」的类别——从 Top 10 中移除不意味着该漏洞不再存在",
      "不要将 Top 10 用作渗透测试方法论——它是风险意识框架，不是测试指南"
    ],
    "case_study_company": "Equifax",
    "case_study": "The 2017 Equifax breach, which exposed 147 million records, was caused by an unpatched Apache Struts vulnerability (CVE-2017-5638) — a remote code execution flaw exploited via OGNL injection, and a textbook case of what the OWASP Top 10 now calls Vulnerable and Outdated Components (A06:2021). A patch had been available for about two months before exploitation. This incident became the canonical example of why organizations must systematically address OWASP Top 10 risks, leading to a settlement of up to $700 million and sweeping changes to vulnerability management practices industry-wide.",
    "case_study_zh": "2017 年 Equifax 数据泄露暴露了 1.47 亿条记录，原因是未修补的 Apache Struts 漏洞（CVE-2017-5638）——一个通过 OGNL 注入实现远程代码执行的缺陷，也是 OWASP Top 10 中「易受攻击和过时的组件」（A06:2021）的典型案例。该漏洞在被利用前约两个月已有可用补丁。此事件成为组织必须系统性应对 OWASP Top 10 风险的标志性案例，导致最高 7 亿美元的和解金并推动全行业漏洞管理实践的深刻变革。",
    "when_not_to_use": [
      "Non-web systems (embedded firmware, desktop-only applications) where web-specific risks don't apply",
      "As the sole security standard — the Top 10 is a starting point, not a comprehensive security program",
      "When the application has no external attack surface (air-gapped, no user input)",
      "For compliance certification — use OWASP ASVS or ISO 27001 instead; the Top 10 is an awareness document"
    ],
    "when_not_to_use_zh": [
      "Web 特定风险不适用的非 Web 系统（嵌入式固件、纯桌面应用）",
      "作为唯一安全标准时——Top 10 是起点，不是完整安全计划",
      "应用无外部攻击面时（气隙环境、无用户输入）",
      "用于合规认证时——应使用 OWASP ASVS 或 ISO 27001；Top 10 是意识文档"
    ],
    "adopters": [
      "PCI Security Standards Council (references OWASP in PCI-DSS)",
      "NIST",
      "US DoD",
      "GitHub (CodeQL rules mapped to OWASP)",
      "Snyk"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "security"
    ],
    "maturity_ring": "foundational",
    "primary_source": "OWASP Foundation (2021). \"OWASP Top 10 — 2021\". owasp.org/Top10.",
    "secondary_sources": [
      "OWASP Foundation (2003). \"OWASP Top 10 — 2003 (First Edition)\". owasp.org.",
      "Stuttard, D. & Pinto, M. (2011). \"The Web Application Hacker's Handbook\", 2nd ed. Wiley."
    ],
    "typed_relations": [
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      },
      {
        "slug": "security-by-design",
        "type": "complement"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      }
    ]
  },
  {
    "id": 117,
    "name": "Principle of Least Privilege",
    "name_zh": "最小权限原则",
    "slug": "principle-of-least-privilege",
    "category": "security",
    "desc": "Grant each subject only the minimum permissions necessary to perform its function, nothing more",
    "desc_zh": "仅授予每个主体执行其功能所需的最小权限，不多不少",
    "steps": [
      "Inventory all subjects (users, services, processes, CI/CD pipelines) and the resources they access",
      "Define the minimum permissions each subject needs based on its actual job function or service contract",
      "Implement role-based (RBAC) or attribute-based (ABAC) access control and assign the scoped-down roles",
      "Remove standing privileges: use just-in-time (JIT) access elevation with time-bound approvals for sensitive operations",
      "Audit access logs regularly, detect privilege creep, and revoke unused permissions automatically"
    ],
    "steps_zh": [
      "盘点所有主体（用户、服务、进程、CI/CD 管道）及其访问的资源",
      "根据实际工作职能或服务契约定义每个主体所需的最小权限",
      "实施基于角色（RBAC）或基于属性（ABAC）的访问控制并分配缩减后的角色",
      "消除常驻权限：对敏感操作使用即时（JIT）访问提升和限时审批",
      "定期审计访问日志，检测权限蠕变，自动撤销未使用的权限"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Inventory",
      "Min Permissions",
      "RBAC / ABAC",
      "JIT Access",
      "Audit"
    ],
    "viz_labels_zh": [
      "资产清单",
      "最小权限",
      "访问控制",
      "即时访问",
      "权限审计"
    ],
    "related": [
      "zero-trust-architecture",
      "oauth2-openid-connect",
      "defense-in-depth"
    ],
    "tags": [
      "least-privilege",
      "access-control",
      "rbac",
      "abac",
      "authorization",
      "iam"
    ],
    "origin_author": "Jerome Saltzer and Michael Schroeder, 1975",
    "origin_source": "Saltzer & Schroeder「The Protection of Information in Computer Systems」(Proceedings of the IEEE, 1975); NIST SP 800-53 AC-6「Least Privilege」",
    "origin_source_zh": "Saltzer & Schroeder「计算机系统中的信息保护」(IEEE 会刊, 1975)；NIST SP 800-53 AC-6「最小权限」",
    "complexity": "beginner",
    "when_to_use": [
      "Designing IAM policies for cloud infrastructure (AWS IAM, Azure RBAC, GCP IAM)",
      "Configuring service accounts and API keys that access shared resources",
      "Defining database roles to restrict queries, writes, and schema changes per application tier",
      "Setting up CI/CD pipeline permissions to limit what build agents can access in production"
    ],
    "when_to_use_zh": [
      "为云基础设施设计 IAM 策略时（AWS IAM、Azure RBAC、GCP IAM）",
      "配置访问共享资源的服务账户和 API 密钥时",
      "定义数据库角色以按应用层限制查询、写入和模式变更时",
      "设置 CI/CD 管道权限以限制构建代理在生产环境中的访问范围时"
    ],
    "core_concepts": [
      "Need-to-Know: Access to information is restricted to subjects who require it for their current task",
      "Role-Based Access Control (RBAC): Permissions are assigned to roles, and subjects are assigned to roles, creating an indirection layer",
      "Just-In-Time (JIT) Access: Elevated privileges are granted temporarily and automatically revoked after a time window",
      "Privilege Creep: The gradual accumulation of access rights over time as subjects change roles without having old permissions revoked",
      "Blast Radius: The scope of damage a compromised subject can cause, directly proportional to its granted permissions"
    ],
    "core_concepts_zh": [
      "按需知悉：信息访问仅限于当前任务需要它的主体",
      "基于角色的访问控制（RBAC）：权限分配给角色，主体分配到角色，创建间接层",
      "即时访问（JIT）：临时授予提升的权限，在时间窗口后自动撤销",
      "权限蠕变：随着主体变更角色而旧权限未被撤销，访问权限逐渐积累",
      "爆炸半径：被入侵主体可造成的损害范围，与其被授予的权限直接成正比"
    ],
    "timeline": [
      [
        "1975",
        "Saltzer and Schroeder define the principle of least privilege in their seminal IEEE paper"
      ],
      [
        "2007",
        "Windows Vista introduces User Account Control (UAC), applying least privilege to desktop users"
      ],
      [
        "2011",
        "AWS IAM launches with fine-grained policy-based access control, making least privilege practical in the cloud"
      ],
      [
        "2019",
        "Cloud providers introduce IAM Access Analyzer (AWS) and similar tools to detect over-privileged identities"
      ],
      [
        "2023",
        "Just-in-time access platforms (CyberArk, BeyondTrust, HashiCorp Boundary) become standard in enterprise security stacks"
      ]
    ],
    "timeline_zh": [
      [
        "1975",
        "Saltzer 和 Schroeder 在开创性 IEEE 论文中定义最小权限原则"
      ],
      [
        "2007",
        "Windows Vista 引入用户账户控制（UAC），将最小权限应用于桌面用户"
      ],
      [
        "2011",
        "AWS IAM 推出细粒度基于策略的访问控制，使最小权限在云端变得实用"
      ],
      [
        "2019",
        "云厂商推出 IAM Access Analyzer（AWS）等工具检测过度授权的身份"
      ],
      [
        "2023",
        "即时访问平台（CyberArk、BeyondTrust、HashiCorp Boundary）成为企业安全栈的标配"
      ]
    ],
    "dos": [
      "Default to deny-all and explicitly grant only needed permissions — whitelisting over blacklisting",
      "Use infrastructure-as-code (Terraform, Pulumi) to codify and review IAM policies in pull requests",
      "Implement automated permission review cycles (quarterly) to catch privilege creep",
      "Separate duties: ensure no single identity can both deploy code and approve production access"
    ],
    "dos_zh": [
      "默认拒绝全部，仅显式授予所需权限——白名单优于黑名单",
      "使用基础设施即代码（Terraform、Pulumi）将 IAM 策略编码化并在 PR 中审查",
      "实施自动化权限审查周期（季度），捕获权限蠕变",
      "职责分离：确保没有单一身份既能部署代码又能批准生产访问"
    ],
    "donts": [
      "Don't use wildcard permissions (e.g., '*' in AWS IAM) in production — they negate the entire principle",
      "Don't share service accounts across multiple applications — each service needs its own scoped identity",
      "Don't grant permanent admin access to developers — use JIT elevation with MFA for administrative tasks",
      "Don't assume least privilege is a one-time setup — permissions must be continuously reviewed as roles evolve"
    ],
    "donts_zh": [
      "不要在生产中使用通配符权限（如 AWS IAM 中的 '*'）——这会否定整个原则",
      "不要跨多个应用共享服务账户——每个服务需要自己的限定范围身份",
      "不要给开发者永久管理员权限——管理任务应使用带 MFA 的 JIT 提升",
      "不要假设最小权限是一次性设置——权限必须随角色演变持续审查"
    ],
    "case_study_company": "Capital One",
    "case_study": "The 2019 Capital One breach exposed 106 million customer records because a misconfigured WAF role had excessive IAM permissions, allowing the attacker to access S3 buckets containing sensitive data. The root cause was a violation of least privilege: the WAF's IAM role could list and read arbitrary S3 buckets rather than being scoped to only the resources it needed. Post-breach, Capital One invested heavily in automated IAM policy analysis and JIT access controls.",
    "case_study_zh": "2019 年 Capital One 数据泄露暴露了 1.06 亿客户记录，原因是 WAF 角色配置错误拥有过度的 IAM 权限，允许攻击者访问包含敏感数据的 S3 存储桶。根本原因是违反最小权限原则：WAF 的 IAM 角色可以列出和读取任意 S3 存储桶，而非仅限于所需资源。事后 Capital One 大力投资于自动化 IAM 策略分析和即时访问控制。",
    "when_not_to_use": [
      "Personal development environments or sandboxes where broad permissions accelerate experimentation",
      "Emergency break-glass scenarios where rapid access to all systems is required — but these should be audited and time-limited",
      "Extremely small projects with a single administrator where role separation is impractical",
      "When the cost of implementing fine-grained access control exceeds the value of the assets being protected"
    ],
    "when_not_to_use_zh": [
      "宽泛权限可加速实验的个人开发环境或沙箱",
      "需要快速访问所有系统的紧急破玻璃场景——但应审计且限时",
      "角色分离不切实际的单管理员极小项目",
      "实施细粒度访问控制的成本超过被保护资产价值时"
    ],
    "adopters": [
      "AWS (IAM least privilege recommendations)",
      "Google Cloud (IAM Recommender)",
      "Microsoft (Azure AD PIM)",
      "HashiCorp (Vault, Boundary)",
      "CyberArk"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Saltzer, J.H. & Schroeder, M.D. (1975). \"The Protection of Information in Computer Systems\". Proceedings of the IEEE, 63(9).",
    "secondary_sources": [
      "NIST (2020). \"Security and Privacy Controls for Information Systems and Organizations, AC-6: Least Privilege\". NIST SP 800-53, Rev. 5.",
      "Anderson, R. (2020). \"Security Engineering\", 3rd ed. Wiley."
    ],
    "typed_relations": [
      {
        "slug": "zero-trust-architecture",
        "type": "extends"
      },
      {
        "slug": "oauth2-openid-connect",
        "type": "complement"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      }
    ]
  },
  {
    "id": 118,
    "name": "Security by Design",
    "name_zh": "安全设计",
    "slug": "security-by-design",
    "category": "security",
    "desc": "Integrate security considerations into every phase of the software development lifecycle from requirements through deployment",
    "desc_zh": "将安全考量集成到从需求到部署的软件开发生命周期的每个阶段",
    "steps": [
      "Elicit security requirements alongside functional requirements: define abuse cases, compliance needs, and trust assumptions",
      "Apply secure design patterns: input validation, secure defaults, fail-secure behavior, and separation of privilege",
      "Implement secure coding practices: parameterized queries, output encoding, safe memory management, dependency scanning",
      "Integrate security testing into CI/CD: SAST, DAST, SCA, secrets scanning, and fuzz testing on every build",
      "Conduct security reviews (architecture review, code audit, penetration test) at each milestone and feed findings back into the design"
    ],
    "steps_zh": [
      "在梳理功能需求的同时获取安全需求：定义滥用用例、合规需求和信任假设",
      "应用安全设计模式：输入验证、安全默认值、安全失败行为和权限分离",
      "实施安全编码实践：参数化查询、输出编码、安全内存管理、依赖扫描",
      "将安全测试集成到 CI/CD：SAST、DAST、SCA、密钥扫描和模糊测试在每次构建中运行",
      "在每个里程碑进行安全审查（架构审查、代码审计、渗透测试）并将发现反馈到设计中"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Requirements",
      "Secure Design",
      "Secure Coding",
      "CI Security",
      "Security Review"
    ],
    "viz_labels_zh": [
      "安全需求",
      "安全设计",
      "安全编码",
      "CI安全",
      "安全审查"
    ],
    "related": [
      "threat-modeling-stride",
      "owasp-top-10",
      "privacy-by-design"
    ],
    "tags": [
      "secure-sdlc",
      "security-by-design",
      "shift-left",
      "secure-coding",
      "devsecops"
    ],
    "origin_author": "Dan Bergh Johnsson, Daniel Deogun, Daniel Sawano (book); Microsoft SDL (process)",
    "origin_source": "Johnsson, Deogun & Sawano「Secure by Design」(Manning, 2019); Microsoft Security Development Lifecycle (SDL); NIST SP 800-160「Systems Security Engineering」",
    "origin_source_zh": "Johnsson, Deogun & Sawano「Secure by Design」(Manning, 2019)；微软安全开发生命周期（SDL）；NIST SP 800-160「系统安全工程」",
    "complexity": "intermediate",
    "when_to_use": [
      "Greenfield projects where security can be baked in from day one at minimal marginal cost",
      "Organizations adopting DevSecOps and shifting security left in the development pipeline",
      "Products in regulated industries (finance, healthcare, defense) where secure SDLC is mandated",
      "After security incidents reveal that vulnerabilities were introduced during design or implementation phases"
    ],
    "when_to_use_zh": [
      "可以从第一天以最小边际成本植入安全的新建项目",
      "采用 DevSecOps 并将安全左移到开发管道的组织",
      "安全 SDLC 是强制要求的受监管行业（金融、医疗、国防）产品",
      "安全事件揭示漏洞是在设计或实现阶段引入后"
    ],
    "core_concepts": [
      "Secure Defaults: Systems ship with the most secure configuration out of the box; users must explicitly opt into less secure settings",
      "Input Validation: All data entering the system is treated as untrusted and validated against strict schemas before processing",
      "Fail-Secure: When a component fails, it denies access rather than failing open — preventing security bypass through error conditions",
      "Domain-Driven Security: Using the type system and domain primitives to make invalid or insecure states unrepresentable in code",
      "Shift Left: Moving security activities (threat modeling, SAST, code review) earlier in the SDLC where fixes are cheapest"
    ],
    "core_concepts_zh": [
      "安全默认值：系统以最安全的配置出厂；用户必须显式选择降低安全设置",
      "输入验证：进入系统的所有数据被视为不可信，在处理前按严格模式验证",
      "安全失败：组件故障时拒绝访问而非开放失败——防止通过错误条件绕过安全",
      "领域驱动安全：使用类型系统和领域原语使无效或不安全状态在代码中不可表达",
      "左移：将安全活动（威胁建模、SAST、代码审查）前移到修复成本最低的 SDLC 早期"
    ],
    "timeline": [
      [
        "2004",
        "Microsoft launches the Security Development Lifecycle (SDL) after the Trustworthy Computing memo"
      ],
      [
        "2006",
        "Gary McGraw publishes「Software Security: Building Security In」, establishing the 'building security in' movement"
      ],
      [
        "2016",
        "DevSecOps movement formalizes security integration into CI/CD pipelines with automated tooling"
      ],
      [
        "2019",
        "Johnsson, Deogun & Sawano publish「Secure by Design」, introducing domain-driven security patterns"
      ],
      [
        "2023",
        "CISA launches the Secure by Design initiative, urging software vendors to take responsibility for product security"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "微软在「可信计算」备忘录后启动安全开发生命周期（SDL）"
      ],
      [
        "2006",
        "Gary McGraw 出版「Software Security: Building Security In」，建立内建安全运动"
      ],
      [
        "2016",
        "DevSecOps 运动通过自动化工具将安全集成正式化到 CI/CD 管道中"
      ],
      [
        "2019",
        "Johnsson、Deogun 和 Sawano 出版「Secure by Design」，引入领域驱动安全模式"
      ],
      [
        "2023",
        "CISA 发起 Secure by Design 倡议，敦促软件供应商对产品安全负责"
      ]
    ],
    "dos": [
      "Use strong type systems and domain primitives to encode security invariants — let the compiler catch violations",
      "Automate security gates in CI/CD: fail the build on critical SAST findings or known vulnerable dependencies",
      "Maintain a security champions program — embed security-minded developers in every team",
      "Treat security bugs with the same priority as functional bugs — they are not second-class issues"
    ],
    "dos_zh": [
      "使用强类型系统和领域原语编码安全不变量——让编译器捕获违规",
      "在 CI/CD 中自动化安全门禁：在关键 SAST 发现或已知漏洞依赖时构建失败",
      "维护安全冠军计划——在每个团队中嵌入有安全意识的开发者",
      "以与功能缺陷同等优先级对待安全缺陷——它们不是二等问题"
    ],
    "donts": [
      "Don't bolt on security as a final phase — the cost of fixing vulnerabilities increases 30x from design to production",
      "Don't rely solely on penetration testing — it finds symptoms, not root causes in the design",
      "Don't assume frameworks handle all security — ORM doesn't prevent all injection; HTTPS doesn't prevent all data exposure",
      "Don't treat security as solely the security team's responsibility — every developer writes security-sensitive code"
    ],
    "donts_zh": [
      "不要将安全作为最终阶段附加——修复漏洞的成本从设计到生产增加 30 倍",
      "不要仅依赖渗透测试——它发现症状而非设计中的根本原因",
      "不要假设框架处理所有安全——ORM 不能防止所有注入；HTTPS 不能防止所有数据泄露",
      "不要把安全视为仅安全团队的责任——每个开发者都在编写安全敏感代码"
    ],
    "case_study_company": "Microsoft",
    "case_study": "Microsoft's Security Development Lifecycle (SDL), mandated company-wide after Bill Gates' 2002 Trustworthy Computing memo, transformed how the company builds software. By requiring threat modeling, static analysis, fuzz testing, and security reviews at defined checkpoints, Microsoft reduced the vulnerability count in Windows by over 50% between Windows XP and Windows 7. The SDL became the template for secure SDLC practices adopted across the industry.",
    "case_study_zh": "微软的安全开发生命周期（SDL）在 2002 年比尔·盖茨「可信计算」备忘录后在全公司强制推行，彻底改变了公司构建软件的方式。通过要求在规定检查点进行威胁建模、静态分析、模糊测试和安全审查，微软将 Windows 从 XP 到 Windows 7 的漏洞数量减少了 50% 以上。SDL 成为全行业采用的安全 SDLC 实践模板。",
    "when_not_to_use": [
      "Disposable prototypes built purely to validate a business hypothesis with zero real user data",
      "When retrofitting security into a massive legacy codebase — start with threat modeling and risk-based prioritization instead of full redesign",
      "Extremely time-constrained emergency patches where the priority is stopping active exploitation",
      "Open-source hobby projects with no users and no sensitive data"
    ],
    "when_not_to_use_zh": [
      "仅用于验证商业假设、零真实用户数据的一次性原型",
      "为大型遗留代码库补装安全时——应从威胁建模和基于风险的优先级排序开始，而非全面重设计",
      "优先级是阻止活跃利用的极度时间紧迫的紧急补丁",
      "无用户和无敏感数据的开源业余项目"
    ],
    "adopters": [
      "Microsoft (SDL)",
      "Google (Project Zero drives secure-by-design improvements)",
      "CISA (Secure by Design initiative)",
      "SAFECode",
      "Cisco (Cisco Secure Development Lifecycle)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Johnsson, D.B., Deogun, D. & Sawano, D. (2019). \"Secure by Design\". Manning Publications.",
    "secondary_sources": [
      "Microsoft (2010). \"Security Development Lifecycle (SDL)\". microsoft.com/sdl.",
      "NIST (2018). \"Systems Security Engineering: Considerations for a Multidisciplinary Approach in the Engineering of Trustworthy Secure Systems\". NIST SP 800-160, Vol. 1."
    ],
    "typed_relations": [
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      },
      {
        "slug": "owasp-top-10",
        "type": "complement"
      },
      {
        "slug": "privacy-by-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 119,
    "name": "Supply Chain Security (SLSA)",
    "name_zh": "软件供应链安全（SLSA）",
    "slug": "supply-chain-security-slsa",
    "category": "security",
    "desc": "Ensure the integrity and provenance of software artifacts through verifiable supply chain levels",
    "desc_zh": "通过可验证的供应链级别确保软件制品的完整性和来源",
    "steps": [
      "Establish source integrity: require code review, enforce branch protection, and sign commits in version control",
      "Secure the build process: use hermetic, reproducible builds on isolated infrastructure with build provenance attestations",
      "Generate and distribute SLSA provenance metadata (who built what, from which source, on which builder) as signed attestations",
      "Verify provenance at every consumption point: package registries, container registries, and deployment pipelines must check attestations",
      "Progress through SLSA levels (L1-L4) incrementally, adding automation and hardening at each level"
    ],
    "steps_zh": [
      "建立源代码完整性：要求代码审查、强制分支保护并在版本控制中签名提交",
      "保护构建过程：使用隔离基础设施上的封闭、可复现构建并生成构建来源证明",
      "生成并分发 SLSA 来源元数据（谁在哪个构建器上从哪个源代码构建了什么）作为签名证明",
      "在每个消费点验证来源：包注册中心、容器注册中心和部署管道必须检查证明",
      "逐步提升 SLSA 级别（L1-L4），在每个级别增加自动化和加固"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Source Integrity",
      "Secure Build",
      "Provenance",
      "Verify Provenance",
      "SLSA Levels"
    ],
    "viz_labels_zh": [
      "源码完整性",
      "安全构建",
      "来源证明",
      "验证来源",
      "SLSA等级"
    ],
    "related": [
      "security-by-design",
      "owasp-top-10",
      "defense-in-depth"
    ],
    "tags": [
      "supply-chain",
      "slsa",
      "provenance",
      "sbom",
      "software-integrity",
      "sigstore"
    ],
    "origin_author": "Google; Linux Foundation / OpenSSF",
    "origin_source": "SLSA specification (slsa.dev) by Google and OpenSSF (2021); US Executive Order 14028 on Improving the Nation's Cybersecurity (2021); NIST SP 800-218「Secure Software Development Framework」",
    "origin_source_zh": "SLSA 规范（slsa.dev）由 Google 和 OpenSSF 发布 (2021)；美国第 14028 号行政令「改善国家网络安全」(2021)；NIST SP 800-218「安全软件开发框架」",
    "complexity": "advanced",
    "when_to_use": [
      "Organizations with complex dependency trees (hundreds of open-source libraries) needing to verify artifact provenance",
      "CI/CD pipelines producing artifacts consumed by external customers or deployed to critical infrastructure",
      "After a supply chain attack (SolarWinds, Log4Shell, xz-utils) motivates investment in provenance verification",
      "Regulated environments requiring Software Bill of Materials (SBOM) and artifact traceability"
    ],
    "when_to_use_zh": [
      "拥有复杂依赖树（数百个开源库）需要验证制品来源的组织",
      "CI/CD 管道产出的制品供外部客户使用或部署到关键基础设施",
      "供应链攻击（SolarWinds、Log4Shell、xz-utils）促使投资来源验证之后",
      "要求软件物料清单（SBOM）和制品可追溯性的受监管环境"
    ],
    "core_concepts": [
      "Provenance: Verifiable metadata describing who built an artifact, from what source, using which build platform, and what process was followed",
      "SLSA Levels: A graduated maturity model (v0.1 levels — L1: documented build; L2: hosted build with signed provenance; L3: hardened build platform; L4: hermetic, reproducible, two-person review; SLSA v1.0 (2023) restructures these into a Build track with L1-L3)",
      "Software Bill of Materials (SBOM): A machine-readable inventory of all components and dependencies in a software artifact",
      "Hermetic Build: A build process that is fully isolated from the network and external state, ensuring reproducibility",
      "Sigstore: An open-source project providing keyless signing, transparency logs (Rekor), and certificate authority (Fulcio) for artifact attestation"
    ],
    "core_concepts_zh": [
      "来源证明：描述谁构建了制品、来自什么源代码、使用哪个构建平台以及遵循什么流程的可验证元数据",
      "SLSA 级别：渐进成熟度模型（v0.1 分级——L1：文档化构建；L2：托管构建 + 签名来源；L3：加固构建平台；L4：封闭可复现 + 双人审查；SLSA v1.0（2023）将其重组为构建轨道 L1-L3）",
      "软件物料清单（SBOM）：软件制品中所有组件和依赖的机器可读清单",
      "封闭构建：完全与网络和外部状态隔离的构建过程，确保可复现性",
      "Sigstore：提供无密钥签名、透明日志（Rekor）和证书颁发机构（Fulcio）的开源制品证明项目"
    ],
    "timeline": [
      [
        "2020-12",
        "SolarWinds supply chain attack pushes a trojanized Orion update to roughly 18,000 organizations, catalyzing industry focus on build integrity"
      ],
      [
        "2021-05",
        "US Executive Order 14028 mandates SBOM and secure software supply chain practices for federal suppliers"
      ],
      [
        "2021-06",
        "Google open-sources the SLSA framework (v0.1) defining four levels of supply chain integrity"
      ],
      [
        "2022-10",
        "Sigstore reaches general availability, providing free keyless signing for open-source artifacts"
      ],
      [
        "2024-03",
        "xz-utils backdoor (CVE-2024-3094) demonstrates social engineering risks in open-source supply chains; SLSA L3+ would have mitigated it"
      ]
    ],
    "timeline_zh": [
      [
        "2020-12",
        "SolarWinds 供应链攻击将植入木马的 Orion 更新推送给约 18,000 个组织，催化行业对构建完整性的关注"
      ],
      [
        "2021-05",
        "美国第 14028 号行政令要求联邦供应商实施 SBOM 和安全软件供应链实践"
      ],
      [
        "2021-06",
        "Google 开源 SLSA 框架（v0.1），定义四个供应链完整性级别"
      ],
      [
        "2022-10",
        "Sigstore 正式发布，为开源制品提供免费无密钥签名"
      ],
      [
        "2024-03",
        "xz-utils 后门（CVE-2024-3094）展示开源供应链中的社会工程风险；SLSA L3+ 可以缓解该攻击"
      ]
    ],
    "dos": [
      "Start with SLSA Level 1 (documented build process) and progress incrementally — perfection is not required to gain value",
      "Generate SBOMs automatically during builds and publish them alongside release artifacts",
      "Use Sigstore or equivalent tooling for artifact signing — keyless signing removes the key management burden",
      "Pin dependencies by hash (not just version) and verify checksums in CI to detect tampering"
    ],
    "dos_zh": [
      "从 SLSA Level 1（文档化构建流程）开始逐步推进——不需要完美即可获得价值",
      "在构建期间自动生成 SBOM 并与发布制品一起发布",
      "使用 Sigstore 或等效工具进行制品签名——无密钥签名消除密钥管理负担",
      "通过哈希（不仅是版本）固定依赖并在 CI 中验证校验和以检测篡改"
    ],
    "donts": [
      "Don't assume your CI/CD platform is inherently trusted — build infrastructure itself is an attack surface",
      "Don't rely on dependency version pinning alone — versions can be republished with different content on some registries",
      "Don't ignore transitive dependencies — most supply chain attacks target deep, overlooked dependencies",
      "Don't treat SBOM generation as compliance theater — if no one consumes the SBOM for policy decisions, it's just a file"
    ],
    "donts_zh": [
      "不要假设 CI/CD 平台天然可信——构建基础设施本身就是攻击面",
      "不要仅依赖依赖版本固定——某些注册中心上版本可能以不同内容重新发布",
      "不要忽略传递依赖——大多数供应链攻击针对深层、被忽视的依赖",
      "不要将 SBOM 生成视为合规面子工程——如果无人消费 SBOM 做策略决策，它只是一个文件"
    ],
    "case_study_company": "Google",
    "case_study": "Google developed SLSA from its internal Binary Authorization for Borg (BAB) system, which has protected Google's production workloads since 2013 by requiring cryptographic provenance verification before any binary can run. When open-sourced as SLSA in 2021, the framework provided the industry with a graduated path toward the same supply chain integrity guarantees. Google mandates SLSA Level 3 for all internal production software and co-founded Sigstore (with Red Hat and Purdue University) to make artifact signing accessible to the entire open-source ecosystem.",
    "case_study_zh": "Google 从内部的 Binary Authorization for Borg（BAB）系统发展出 SLSA，该系统自 2013 年起通过在任何二进制文件运行前要求加密来源验证来保护 Google 的生产工作负载。2021 年开源为 SLSA 后，该框架为行业提供了通向同等供应链完整性保证的渐进路径。Google 对所有内部生产软件强制要求 SLSA Level 3，并与 Red Hat、普渡大学共同创立了 Sigstore，使制品签名对整个开源生态可及。",
    "when_not_to_use": [
      "Single-developer personal projects with no external consumers of the build artifacts",
      "Early prototyping phases where the dependency set changes daily and provenance verification adds friction without risk reduction",
      "Environments where all software is built and consumed entirely within a physically secured enclave with no external dependencies",
      "When the team has not yet adopted basic CI/CD — establish a consistent build pipeline first before adding supply chain verification"
    ],
    "when_not_to_use_zh": [
      "无外部构建制品消费者的单人个人项目",
      "依赖集每日变化、来源验证增加摩擦而不降低风险的早期原型阶段",
      "所有软件完全在物理安全飞地内构建和消费、无外部依赖的环境",
      "团队尚未采用基本 CI/CD 时——应先建立一致的构建管道再添加供应链验证"
    ],
    "adopters": [
      "Google",
      "GitHub (Artifact Attestations)",
      "npm (provenance for packages)",
      "Kubernetes (SLSA L3 for releases)",
      "OpenSSF / Linux Foundation"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Google & OpenSSF (2021). \"SLSA: Supply-chain Levels for Software Artifacts\". slsa.dev.",
    "secondary_sources": [
      "The White House (2021). \"Executive Order 14028 on Improving the Nation's Cybersecurity\". Federal Register.",
      "NIST (2022). \"Secure Software Development Framework (SSDF)\". NIST SP 800-218.",
      "in-toto Project (2019). \"in-toto: Providing Farm-to-Table Guarantees for Bits and Bytes\". Proceedings of USENIX Security Symposium."
    ],
    "typed_relations": [
      {
        "slug": "security-by-design",
        "type": "extends"
      },
      {
        "slug": "owasp-top-10",
        "type": "complement"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      }
    ]
  },
  {
    "id": 120,
    "name": "Confidential Computing",
    "name_zh": "机密计算",
    "slug": "confidential-computing",
    "category": "security",
    "desc": "Protect data in use by performing computation within hardware-based trusted execution environments (TEEs)",
    "desc_zh": "通过在基于硬件的可信执行环境（TEE）中执行计算来保护使用中的数据",
    "steps": [
      "Identify workloads that process sensitive data in memory and are vulnerable to privileged-attacker threats (rogue admins, compromised hypervisors)",
      "Select a TEE platform: Intel SGX enclaves, Intel TDX, AMD SEV-SNP confidential VMs, or Arm CCA",
      "Design the application to partition sensitive processing into the enclave/TEE and keep the trusted computing base (TCB) minimal",
      "Implement remote attestation: cryptographically verify the TEE's identity, firmware version, and code measurements before sending sensitive data",
      "Deploy, monitor, and maintain the confidential workload: handle attestation failures, manage sealed storage, and update enclave code with re-attestation"
    ],
    "steps_zh": [
      "识别在内存中处理敏感数据且易受特权攻击者威胁（恶意管理员、被入侵的虚拟机管理程序）的工作负载",
      "选择 TEE 平台：Intel SGX 安全飞地、Intel TDX、AMD SEV-SNP 机密虚拟机或 Arm CCA",
      "设计应用将敏感处理分区到飞地/TEE 中，保持可信计算基（TCB）最小化",
      "实施远程证明：在发送敏感数据前加密验证 TEE 的身份、固件版本和代码度量",
      "部署、监控和维护机密工作负载：处理证明失败、管理密封存储并通过重新证明更新飞地代码"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Sensitive Workload",
      "TEE Platform",
      "Partition Design",
      "Remote Attestation",
      "Monitor"
    ],
    "viz_labels_zh": [
      "敏感工作负载",
      "可信执行环境",
      "隔离设计",
      "远程证明",
      "部署监控"
    ],
    "related": [
      "zero-trust-architecture",
      "defense-in-depth",
      "privacy-by-design"
    ],
    "tags": [
      "confidential-computing",
      "tee",
      "enclave",
      "attestation",
      "data-in-use",
      "hardware-security"
    ],
    "origin_author": "Confidential Computing Consortium (Linux Foundation); Intel, AMD, Arm",
    "origin_source": "Confidential Computing Consortium「A Technical Analysis of Confidential Computing」(2021); Intel SGX documentation; AMD SEV-SNP whitepaper; NIST IR 8320「Hardware-Enabled Security」",
    "origin_source_zh": "机密计算联盟「机密计算技术分析」(2021)；Intel SGX 文档；AMD SEV-SNP 白皮书；NIST IR 8320「硬件使能安全」",
    "complexity": "advanced",
    "when_to_use": [
      "Multi-party computation scenarios where mutually distrustful parties need to process shared data without revealing it to each other",
      "Cloud workloads processing highly sensitive data where the cloud provider must be excluded from the trust boundary",
      "AI/ML inference on private data (healthcare, finance) where model inputs must be protected even from the infrastructure operator",
      "Key management and cryptographic operations where private keys must never exist in unencrypted memory accessible to the host OS"
    ],
    "when_to_use_zh": [
      "互不信任的多方需要处理共享数据而不向对方泄露的多方计算场景",
      "处理高度敏感数据且必须将云提供商排除在信任边界外的云工作负载",
      "在私有数据（医疗、金融）上进行 AI/ML 推理，模型输入必须对基础设施运营者保密",
      "密钥管理和加密操作中私钥决不能以未加密形式存在于主机 OS 可访问的内存中"
    ],
    "core_concepts": [
      "Trusted Execution Environment (TEE): A hardware-isolated processing area where code and data are protected from the host OS, hypervisor, and other tenants",
      "Remote Attestation: A cryptographic protocol that allows a remote party to verify the identity, integrity, and configuration of a TEE before entrusting it with sensitive data",
      "Trusted Computing Base (TCB): The minimal set of hardware and software components that must be trusted — smaller TCB means smaller attack surface",
      "Memory Encryption: Hardware-level encryption of the TEE's memory pages, preventing physical and software-based memory snooping attacks",
      "Sealed Storage: Data encrypted to a specific TEE's identity so that only that enclave (or an authorized successor) can decrypt it"
    ],
    "core_concepts_zh": [
      "可信执行环境（TEE）：硬件隔离的处理区域，其中代码和数据受保护免受主机 OS、虚拟机管理程序和其他租户的访问",
      "远程证明：允许远程方在委托敏感数据前验证 TEE 身份、完整性和配置的加密协议",
      "可信计算基（TCB）：必须被信任的最小硬件和软件组件集——更小的 TCB 意味着更小的攻击面",
      "内存加密：TEE 内存页的硬件级加密，防止物理和基于软件的内存窥探攻击",
      "密封存储：加密到特定 TEE 身份的数据，仅该飞地（或授权继任者）可解密"
    ],
    "timeline": [
      [
        "2015",
        "Intel launches SGX (Software Guard Extensions) in 6th-gen Core processors, introducing user-space enclaves"
      ],
      [
        "2019",
        "Confidential Computing Consortium founded under the Linux Foundation by Intel, Microsoft, Google, Arm, and others"
      ],
      [
        "2020",
        "AMD SEV-SNP (Secure Nested Paging) adds strong VM-level isolation, making confidential VMs practical"
      ],
      [
        "2022",
        "Azure, GCP, and AWS launch confidential VM offerings based on AMD SEV-SNP and Intel TDX"
      ],
      [
        "2024",
        "Arm CCA (Confidential Compute Architecture) extends confidential computing to mobile and edge devices; NVIDIA H100 adds GPU TEE support"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Intel 在第六代 Core 处理器中推出 SGX（软件保护扩展），引入用户态飞地"
      ],
      [
        "2019",
        "机密计算联盟在 Linux 基金会下由 Intel、Microsoft、Google、Arm 等发起成立"
      ],
      [
        "2020",
        "AMD SEV-SNP（安全嵌套分页）添加强 VM 级隔离，使机密虚拟机切实可行"
      ],
      [
        "2022",
        "Azure、GCP 和 AWS 推出基于 AMD SEV-SNP 和 Intel TDX 的机密 VM 产品"
      ],
      [
        "2024",
        "Arm CCA（机密计算架构）将机密计算扩展到移动和边缘设备；NVIDIA H100 添加 GPU TEE 支持"
      ]
    ],
    "dos": [
      "Always verify remote attestation before sending sensitive data to a TEE — attestation is the foundation of trust",
      "Minimize the trusted computing base: smaller enclave code means fewer bugs and a smaller attack surface",
      "Use well-audited TEE SDKs (Open Enclave, Gramine, Enarx) rather than building enclave runtimes from scratch",
      "Plan for side-channel mitigations: constant-time algorithms, ORAM access patterns, and compiler-level defenses"
    ],
    "dos_zh": [
      "在向 TEE 发送敏感数据前始终验证远程证明——证明是信任的基础",
      "最小化可信计算基：更小的飞地代码意味着更少的缺陷和更小的攻击面",
      "使用经过审计的 TEE SDK（Open Enclave、Gramine、Enarx）而非从零构建飞地运行时",
      "规划侧信道缓解：常量时间算法、ORAM 访问模式和编译器级防御"
    ],
    "donts": [
      "Don't assume TEEs are invulnerable — side-channel and fault-injection attacks (Spectre, Foreshadow, Plundervolt) have repeatedly broken SGX guarantees",
      "Don't put the entire application inside the enclave — only the security-sensitive processing belongs there",
      "Don't skip firmware and microcode updates — TEE security depends on patches for known hardware vulnerabilities",
      "Don't treat attestation as a one-time check — re-attest periodically and after any TEE restart or migration"
    ],
    "donts_zh": [
      "不要假设 TEE 无懈可击——侧信道和故障注入攻击（Spectre、Foreshadow、Plundervolt）已多次突破 SGX 保证",
      "不要将整个应用放入飞地——仅安全敏感处理属于其中",
      "不要跳过固件和微码更新——TEE 安全依赖于已知硬件漏洞的补丁",
      "不要将证明视为一次性检查——应定期重新证明，并在 TEE 重启或迁移后重新证明"
    ],
    "case_study_company": "Signal",
    "case_study": "Signal implemented Secure Value Recovery (SVR) using Intel SGX enclaves to protect user PIN-derived keys on the server side. By running the PIN verification and key derivation inside an enclave, Signal ensures that even a compromised server or a rogue Signal employee cannot extract the encryption keys protecting users' message history and contacts. Remote attestation allows Signal clients to verify the enclave's integrity before submitting their PIN, achieving end-to-end security for the cloud-assisted recovery process.",
    "case_study_zh": "Signal 使用 Intel SGX 飞地实现了安全值恢复（SVR），在服务器端保护用户 PIN 派生的密钥。通过在飞地内运行 PIN 验证和密钥派生，Signal 确保即使服务器被入侵或有内鬼员工也无法提取保护用户消息历史和联系人的加密密钥。远程证明使 Signal 客户端在提交 PIN 前可以验证飞地完整性，为云辅助恢复过程实现端到端安全。",
    "when_not_to_use": [
      "Workloads with no confidentiality requirements — the performance overhead of TEEs (5-30%) is unjustified",
      "When the threat model only includes external network attackers and not privileged insiders or infrastructure compromise",
      "On hardware that lacks TEE support — software-only emulation provides no real security guarantee",
      "Simple data-at-rest protection needs where standard disk encryption (LUKS, BitLocker) is sufficient"
    ],
    "when_not_to_use_zh": [
      "无保密需求的工作负载——TEE 的性能开销（5-30%）不合理",
      "威胁模型仅包含外部网络攻击者而不包含特权内部人员或基础设施入侵时",
      "缺乏 TEE 支持的硬件上——纯软件模拟不提供真正的安全保证",
      "标准磁盘加密（LUKS、BitLocker）已足够的简单静态数据保护需求"
    ],
    "adopters": [
      "Signal (Secure Value Recovery)",
      "Microsoft (Azure Confidential Computing)",
      "Google Cloud (Confidential VMs and GKE Nodes)",
      "Fortanix",
      "Anjuna Security"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "security"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Confidential Computing Consortium (2021). \"A Technical Analysis of Confidential Computing\". Linux Foundation.",
    "secondary_sources": [
      "Costan, V. & Devadas, S. (2016). \"Intel SGX Explained\". IACR Cryptology ePrint Archive.",
      "NIST (2021). \"Hardware-Enabled Security: Enabling a Layered Approach to Platform Security\". NIST IR 8320.",
      "Kaplan, D. et al. (2016). \"AMD Memory Encryption\". AMD White Paper."
    ],
    "typed_relations": [
      {
        "slug": "zero-trust-architecture",
        "type": "complement"
      },
      {
        "slug": "defense-in-depth",
        "type": "extends"
      },
      {
        "slug": "privacy-by-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 200,
    "name": "Security Champions Program",
    "name_zh": "安全冠军计划",
    "slug": "security-champions-program",
    "category": "security",
    "desc": "Embedding security advocates within development teams",
    "desc_zh": "在开发团队中嵌入安全倡导者",
    "steps": [
      "Identify and recruit volunteer security champions from existing engineering teams — target developers with security curiosity rather than requiring prior expertise, aiming for at least one champion per team or squad",
      "Define the security champion role with clear expectations: attend monthly security training, perform lightweight threat modeling on their team's new features, triage security findings from SAST/DAST tools, and serve as the first point of contact for security questions",
      "Build a security champions community with regular meetings, a dedicated communication channel, and recognition programs so champions feel supported by the central security team and incentivized to continue",
      "Equip champions with tooling and training: access to vulnerability databases, secure coding guidelines, threat modeling templates, and a direct escalation path to the security team for complex issues",
      "Measure program effectiveness through metrics: number of security issues caught pre-production per team, champion participation rates, training completion, and reduction in security debt over time"
    ],
    "steps_zh": [
      "从现有工程团队中识别并招募自愿的安全冠军——针对有安全好奇心的开发者，而非要求具备先验专业知识，目标是每个团队或小队至少有一名冠军",
      "以明确期望定义安全冠军角色：参加每月安全培训、对团队新功能进行轻量级威胁建模、分类SAST/DAST工具的安全发现，以及作为安全问题的第一联系人",
      "通过定期会议、专用通信渠道和认可计划建立安全冠军社区，使冠军感受到中心安全团队的支持并有动力继续",
      "为冠军提供工具和培训：访问漏洞数据库、安全编码指南、威胁建模模板，以及针对复杂问题直接上报安全团队的渠道",
      "通过指标衡量计划效果：每个团队在生产前发现的安全问题数量、冠军参与率、培训完成情况以及安全债务随时间的减少"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Recruit",
      "Define Role",
      "Community",
      "Tooling",
      "Metrics"
    ],
    "viz_labels_zh": [
      "招募成员",
      "角色定义",
      "安全社区",
      "工具培训",
      "效果度量"
    ],
    "related": [
      "security-by-design",
      "threat-modeling-stride",
      "owasp-top-10",
      "defense-in-depth"
    ],
    "tags": [
      "security-champions",
      "devsecops",
      "culture",
      "shift-left",
      "team-security"
    ],
    "origin_author": "OWASP Security Champions Playbook; popularized by companies like Spotify and Nokia",
    "origin_source": "OWASP Security Champions Guide (owasp.org); Schoenfield, B. (2015). \"Securing Systems: Applied Security Architecture and Threat Models\". CRC Press",
    "origin_source_zh": "OWASP安全冠军指南（owasp.org）；Schoenfield, B.《系统安全：应用安全架构与威胁建模》（CRC Press，2015）",
    "complexity": "intermediate",
    "when_to_use": [
      "When the central security team cannot scale to review every feature and service across dozens of engineering teams",
      "When DevSecOps adoption requires security knowledge to be distributed into engineering teams rather than bottlenecked through a security gate",
      "When security training completion rates are low and developers lack actionable security guidance relevant to their day-to-day work",
      "When recurring security vulnerabilities in the same categories suggest that developers lack immediate access to security expertise during development"
    ],
    "when_to_use_zh": [
      "当中心安全团队无法扩展以审查数十个工程团队的每个功能和服务时",
      "当DevSecOps采纳要求安全知识分布到工程团队，而非通过安全门被瓶颈化时",
      "当安全培训完成率低且开发者缺乏与日常工作相关的可操作安全指导时",
      "当同类别的重复安全漏洞表明开发者在开发过程中缺乏即时安全专业知识时"
    ],
    "core_concepts": [
      "Distributed security: embedding security responsibility within each engineering team rather than centralizing it in a security silo, matching security coverage to development velocity",
      "Security advocate: a developer who acts as a security liaison between their engineering team and the central security organization, bridging the cultural and knowledge gap",
      "Shift-left: catching security issues during design and development rather than at a final security review gate, reducing remediation cost and time-to-fix",
      "Community of practice: a cross-team network of security champions who share knowledge, patterns, and tooling, multiplying the impact of the central security team"
    ],
    "core_concepts_zh": [
      "分布式安全：将安全责任嵌入每个工程团队，而非集中在安全孤岛中，使安全覆盖与开发速度相匹配",
      "安全倡导者：作为工程团队和中心安全组织之间安全联络人的开发者，弥合文化和知识鸿沟",
      "左移：在设计和开发期间而非最终安全审查门处发现安全问题，降低修复成本和修复时间",
      "实践社区：共享知识、模式和工具的跨团队安全冠军网络，倍增中心安全团队的影响力"
    ],
    "timeline": [
      [
        "2011",
        "OWASP begins documenting security champions as an organizational pattern for scaling security across large engineering organizations"
      ],
      [
        "2015",
        "Spotify and Nokia publish their security champions program experiences, providing concrete implementation models for the industry"
      ],
      [
        "2019",
        "OWASP Security Champions Playbook v1.0 released, providing a structured guide for program creation and measurement"
      ],
      [
        "2022",
        "Security champions programs become a standard recommendation in NIST SSDF and CISA Secure by Design guidance for enterprise DevSecOps"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "OWASP开始将安全冠军记录为在大型工程组织中扩展安全的组织模式"
      ],
      [
        "2015",
        "Spotify和Nokia发布其安全冠军计划经验，为行业提供具体的实施模型"
      ],
      [
        "2019",
        "OWASP安全冠军手册v1.0发布，为计划创建和衡量提供结构化指南"
      ],
      [
        "2022",
        "安全冠军计划成为NIST SSDF和CISA安全设计指南中企业DevSecOps的标准建议"
      ]
    ],
    "dos": [
      "Do make the security champion role voluntary and recognized because mandatory, unrecognized security responsibilities demotivate developers and produce nominal participation",
      "Do give champions dedicated time (10-20% of a sprint) for security activities because security work that competes with feature delivery will always lose without protected time",
      "Do keep champions updated on the current threat landscape because stale security knowledge leads to champions advocating for outdated controls",
      "Do measure the program impact with concrete metrics and share them with leadership because visible ROI protects the program from cost-cutting when budgets tighten"
    ],
    "dos_zh": [
      "将安全冠军角色设为自愿且获得认可，因为强制性、不被认可的安全责任会让开发者丧失积极性，导致名义上的参与",
      "给冠军留出专用时间（每个冲刺10-20%）用于安全活动，因为与功能交付竞争的安全工作在没有受保护时间的情况下总会失败",
      "使冠军了解当前威胁态势，因为过时的安全知识会导致冠军倡导过时的控制措施",
      "用具体指标衡量计划影响并与领导层分享，因为可见的投资回报率在预算收紧时保护计划免受削减"
    ],
    "donts": [
      "Don't treat champions as free security auditors who must review every pull request because this burns out champions and creates the same bottleneck as the central security team",
      "Don't run the program without central security team support because champions need escalation paths, training, and recognition — an unsupported champion quickly becomes a former champion",
      "Don't make champions responsible for fixing all security bugs in their team because ownership should be with the engineer who introduced the bug, not the security champion",
      "Don't neglect champion turnover planning because when a champion leaves the team their security knowledge leaves with them; maintain a succession pipeline"
    ],
    "donts_zh": [
      "不要将冠军视为必须审查每个拉取请求的免费安全审计员，因为这会让冠军精疲力竭，并产生与中心安全团队相同的瓶颈",
      "不要在没有中心安全团队支持的情况下运行计划，因为冠军需要上报途径、培训和认可——不受支持的冠军很快就会成为前冠军",
      "不要让冠军负责修复其团队中的所有安全漏洞，因为所有权应属于引入漏洞的工程师，而非安全冠军",
      "不要忽视冠军离职规划，因为当冠军离开团队时，他们的安全知识也随之消失；维护继任管道"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify's security champions program grew to over 400 champions across 150+ engineering squads by 2020. Each chapter (domain group) has at least one champion trained in threat modeling, secure code review, and dependency vulnerability management. Champions run quarterly threat modeling sessions for new features and triage SAST findings from their automated security scanning pipeline. Spotify's security team reports that the program reduced the time from vulnerability discovery to patch from an average of 45 days to under 10 days, with champions resolving 70% of findings without central security team involvement.",
    "case_study_zh": "Spotify的安全冠军计划到2020年已在150多个工程小队中发展到400多名冠军。每个分会（Chapter，即领域组）至少有一名经过威胁建模、安全代码审查和依赖漏洞管理培训的冠军。冠军为新功能运行季度威胁建模会议，并对自动化安全扫描管道中的SAST发现进行分类。Spotify安全团队报告称，该计划将从漏洞发现到修补的时间从平均45天缩短到10天以内，冠军在不涉及中心安全团队的情况下解决了70%的发现。",
    "when_not_to_use": [
      "Very small engineering teams (under 10 engineers) where the entire team can be trained in security and a dedicated champion role is redundant",
      "Organizations where developers are already deeply security-aware and the central security team is well-staffed enough to review all features",
      "When leadership will not allocate dedicated time for champion activities because an underfunded program creates false assurance without real security improvement",
      "Crisis response situations where immediate security remediation is needed and building a long-term program is not the priority"
    ],
    "when_not_to_use_zh": [
      "非常小的工程团队（少于10名工程师），整个团队都可以接受安全培训，专职冠军角色是多余的",
      "开发者已具备深度安全意识，且中心安全团队人员充足、足以审查所有功能的组织",
      "当领导层不愿为冠军活动分配专用时间时，因为资金不足的计划会在没有真正安全改进的情况下产生虚假保证",
      "需要立即安全整治的危机响应情况，构建长期计划不是优先事项"
    ],
    "adopters": [
      "Spotify",
      "Nokia",
      "SAP",
      "Etsy",
      "ING Bank"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "security"
    ],
    "maturity_ring": "established",
    "primary_source": "OWASP (2019). \"Security Champions Playbook\". owasp.org/www-project-security-champions-guidebook.",
    "secondary_sources": [
      "Schoenfield, B.S.E. (2015). \"Securing Systems: Applied Security Architecture and Threat Models\". CRC Press.",
      "Pohl, C. & Hof, H.J. (2015). \"Secure Scrum: Development of Secure Software with Scrum\". Proceedings of ICSEA 2015.",
      "Kim, G. et al. (2016). \"The DevOps Handbook\". IT Revolution Press."
    ],
    "typed_relations": [
      {
        "slug": "security-by-design",
        "type": "complement"
      },
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      },
      {
        "slug": "owasp-top-10",
        "type": "complement"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      }
    ]
  },
  {
    "id": 201,
    "name": "Secrets Management",
    "name_zh": "密钥管理",
    "slug": "secrets-management",
    "category": "security",
    "desc": "Centralized vault-based secrets lifecycle (HashiCorp Vault, 2015)",
    "desc_zh": "基于集中化保险库的密钥生命周期管理（HashiCorp Vault，2015）",
    "steps": [
      "Centralize all secrets (API keys, database passwords, TLS certificates, service account credentials) into a dedicated secrets management system (HashiCorp Vault, AWS Secrets Manager, Azure Key Vault) and remove them from source code, configuration files, and CI/CD environment variables",
      "Implement dynamic secrets where possible: configure Vault or equivalent to generate short-lived, ephemeral credentials for databases and cloud services on demand, so no long-lived static credentials exist",
      "Enforce least-privilege access policies that grant each service and human identity access only to the specific secrets they need, with audit logging of every secret read, write, and rotation event",
      "Implement automated secret rotation for static credentials that cannot be replaced with dynamic secrets, with rotation periods aligned to risk level and compliance requirements",
      "Integrate secrets injection into the deployment pipeline using environment-specific policies: applications retrieve secrets at startup via the secrets manager API or sidecar agent, never via baked-in config files"
    ],
    "steps_zh": [
      "将所有密钥（API密钥、数据库密码、TLS证书、服务账户凭证）集中到专用密钥管理系统（HashiCorp Vault、AWS Secrets Manager、Azure Key Vault）中，并从源代码、配置文件和CI/CD环境变量中删除它们",
      "尽可能实施动态密钥：配置Vault或同类产品按需为数据库和云服务生成短期临时凭证，使长期静态凭证不存在",
      "执行最小权限访问策略，仅授予每个服务和人员身份访问其所需特定密钥的权限，并对每次密钥读取、写入和轮换事件进行审计记录",
      "为无法替换为动态密钥的静态凭证实施自动化密钥轮换，轮换周期与风险级别和合规要求对齐",
      "使用环境特定策略将密钥注入集成到部署管道中：应用程序在启动时通过密钥管理器API或Sidecar代理检索密钥，而非通过内置配置文件"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Centralize",
      "Dynamic Secrets",
      "Least Privilege",
      "Auto Rotation",
      "CI Inject"
    ],
    "viz_labels_zh": [
      "集中管理",
      "动态密钥",
      "最小权限",
      "自动轮换",
      "CI注入"
    ],
    "related": [
      "zero-trust-architecture",
      "security-by-design",
      "identity-federation"
    ],
    "tags": [
      "secrets-management",
      "vault",
      "credentials",
      "rotation",
      "dynamic-secrets"
    ],
    "origin_author": "HashiCorp (Mitchell Hashimoto), 2015",
    "origin_source": "HashiCorp Vault documentation (vaultproject.io, 2015); NIST SP 800-57 'Recommendation for Key Management'",
    "origin_source_zh": "HashiCorp Vault文档（vaultproject.io，2015）；NIST SP 800-57「密钥管理建议」",
    "complexity": "intermediate",
    "when_to_use": [
      "When secrets in source code repositories, CI/CD logs, or configuration files have been discovered or are at risk of exposure",
      "When microservices architecture requires each service to authenticate to databases, message brokers, and APIs with unique credentials that can be individually rotated",
      "When compliance frameworks (SOC 2, PCI-DSS, ISO 27001) require evidence of secret rotation, access logging, and least-privilege credential management",
      "When cloud-native deployments need to manage secrets across multiple cloud accounts, Kubernetes namespaces, and deployment environments without credential sprawl"
    ],
    "when_to_use_zh": [
      "当源代码库、CI/CD日志或配置文件中的密钥已被发现或面临泄露风险时",
      "当微服务架构要求每个服务以可单独轮换的唯一凭证向数据库、消息代理和API进行身份验证时",
      "当合规框架（SOC 2、PCI-DSS、ISO 27001）要求密钥轮换、访问记录和最小权限凭证管理的证据时",
      "当云原生部署需要在多个云账户、Kubernetes命名空间和部署环境中管理密钥而不产生凭证蔓延时"
    ],
    "core_concepts": [
      "Dynamic secrets: credentials generated on demand with a short TTL, automatically invalidated after expiry; eliminates the long-lived static credential risk entirely",
      "Vault transit engine: encryption as a service that allows applications to encrypt/decrypt data without ever seeing the encryption key, keeping key material inside the vault",
      "AppRole / Kubernetes auth: machine identity authentication mechanisms that allow services to authenticate to Vault without a pre-shared secret, addressing the secret zero bootstrap problem",
      "Secret zero problem: the bootstrap challenge of how the first credential used to access the secrets manager is itself managed securely — solved by cloud IAM roles or hardware TPM attestation"
    ],
    "core_concepts_zh": [
      "动态密钥：按需生成的短期TTL凭证，到期后自动失效；完全消除长期静态凭证风险",
      "Vault传输引擎：加密即服务，允许应用程序在不查看加密密钥的情况下加密/解密数据，将密钥材料保存在保险库内",
      "AppRole/Kubernetes认证：机器身份认证机制，允许服务在没有预共享密钥的情况下向Vault进行身份验证，解决初始密钥（Secret Zero）引导问题",
      "初始密钥（Secret Zero）问题：用于访问密钥管理器的第一个凭证本身如何安全管理的引导挑战——通过云IAM角色或硬件TPM证明解决"
    ],
    "timeline": [
      [
        "2015",
        "HashiCorp releases Vault 0.1, establishing the open-source secrets management category with a plugin-based secrets engine architecture"
      ],
      [
        "2018",
        "AWS Secrets Manager launched, bringing managed secrets management to cloud-native teams without self-hosting Vault"
      ],
      [
        "2020",
        "Kubernetes External Secrets Operator and Vault Agent Injector patterns standardize secrets injection into containerized workloads"
      ],
      [
        "2023",
        "OpenBao forks HashiCorp Vault after the BSL license change; the SPIFFE/SPIRE workload identity standard matures, offering a modern answer to the secrets bootstrap problem"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "HashiCorp发布Vault 0.1，以基于插件的密钥引擎架构建立开源密钥管理类别"
      ],
      [
        "2018",
        "AWS Secrets Manager推出，为云原生团队提供托管密钥管理，无需自托管Vault"
      ],
      [
        "2020",
        "Kubernetes External Secrets Operator和Vault Agent Injector模式标准化了将密钥注入容器化工作负载"
      ],
      [
        "2023",
        "OpenBao在HashiCorp BSL许可证变更后分叉Vault；SPIFFE/SPIRE工作负载身份标准成熟，为密钥引导问题提供了现代化的解决方案"
      ]
    ],
    "dos": [
      "Do enforce secrets-in-code detection in CI/CD using tools like git-secrets, truffleHog, or GitHub secret scanning because secrets committed to repositories are exposed to anyone with repo access",
      "Do prefer dynamic secrets over static credentials wherever the target system supports them because a credential with a 15-minute TTL can be abused only within a very short window even if intercepted",
      "Do namespace secrets by environment and service in the vault so that a compromised application token cannot read another service's secrets or production secrets from a development context",
      "Do test secret rotation procedures regularly because untested rotation breaks production when it is triggered under incident pressure"
    ],
    "dos_zh": [
      "在CI/CD中使用git-secrets、truffleHog或GitHub密钥扫描等工具执行代码中密钥检测，因为提交到存储库的密钥会暴露给任何有存储库访问权限的人",
      "在目标系统支持的地方优先使用动态密钥而非静态凭证，因为TTL为15分钟的凭证即使被截获，可被滥用的时间窗口也极短",
      "在保险库中按环境和服务对密钥进行命名空间划分，使受攻击的应用程序令牌无法读取另一个服务的密钥或从开发环境读取生产密钥",
      "定期测试密钥轮换程序，因为未经测试的轮换会在事故压力下触发时导致生产中断"
    ],
    "donts": [
      "Don't store secrets in environment variables baked into container images because image layers are easily extracted and environment variables are often logged",
      "Don't use the same Vault token across multiple services because token compromise then affects all services sharing that token",
      "Don't skip secret rotation because 'it's been working fine' — unrotated credentials are the attack surface for credential stuffing and insider threat scenarios",
      "Don't make the secrets manager a single point of failure without high-availability configuration because if Vault goes down and applications cannot retrieve secrets they will fail to restart after a crash"
    ],
    "donts_zh": [
      "不要将密钥存储在容器镜像中内置的环境变量中，因为镜像层很容易被提取，且环境变量经常被记录",
      "不要在多个服务间使用相同的Vault令牌，因为令牌泄露会影响共享该令牌的所有服务",
      "不要以「一直运行良好」为由跳过密钥轮换——未轮换的凭证是凭证填充和内部威胁场景的攻击面",
      "不要在没有高可用配置的情况下使密钥管理器成为单点故障，因为如果Vault宕机且应用程序无法检索密钥，它们在崩溃后将无法重启"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub migrated its internal secrets management to HashiCorp Vault after a 2012 incident where a developer accidentally exposed an AWS access key in a public repository. Vault's AppRole authentication was used to give each GitHub service a unique identity, with dynamic AWS credentials generated per-request with a 15-minute TTL for deployment pipelines. GitHub also integrated Vault with their CI/CD system so that build jobs inject secrets at runtime rather than storing them as build environment variables. After the migration, GitHub reported zero secrets-related production security incidents over a 3-year period.",
    "case_study_zh": "GitHub在2012年一名开发者意外在公共存储库中暴露AWS访问密钥的事件后，将其内部密钥管理迁移到HashiCorp Vault。Vault的AppRole认证用于给每个GitHub服务一个唯一身份，为部署管道按请求生成TTL为15分钟的动态AWS凭证。GitHub还将Vault与其CI/CD系统集成，使构建作业在运行时注入密钥，而非将其存储为构建环境变量。迁移后，GitHub报告在3年期间没有与密钥相关的生产安全事件。",
    "when_not_to_use": [
      "Simple single-developer projects where secrets are managed through personal password managers and the overhead of Vault is disproportionate to the risk",
      "When the entire application runs within a single cloud provider and that provider's native secrets service (AWS Secrets Manager, GCP Secret Manager) fully meets the requirements",
      "Short-lived hackathon or prototype projects where the setup cost of a secrets manager exceeds the project lifetime",
      "When the team lacks the operational expertise to run a highly available Vault cluster — a misconfigured Vault is worse than no Vault"
    ],
    "when_not_to_use_zh": [
      "通过个人密码管理器管理密钥的简单单开发者项目，Vault的开销与风险不成比例",
      "当整个应用程序在单个云提供商内运行，且该提供商的原生密钥服务（AWS Secrets Manager、GCP Secret Manager）完全满足要求时",
      "设置密钥管理器的成本超过项目生命周期的短期黑客马拉松或原型项目",
      "当团队缺乏运行高可用Vault集群的运维专业知识时——配置错误的Vault比没有Vault更糟糕"
    ],
    "adopters": [
      "GitHub",
      "Cloudflare",
      "Shopify",
      "Twilio",
      "HashiCorp (self)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "HashiCorp (2015). \"Vault Documentation\". vaultproject.io.",
    "secondary_sources": [
      "NIST (2020). \"NIST SP 800-57 Part 1 Rev. 5: Recommendation for Key Management\". National Institute of Standards and Technology.",
      "Brotherton, S. (2022). \"Secrets Management Patterns\". O'Reilly Media.",
      "AWS (2018). \"AWS Secrets Manager User Guide\". docs.aws.amazon.com."
    ],
    "typed_relations": [
      {
        "slug": "zero-trust-architecture",
        "type": "complement"
      },
      {
        "slug": "security-by-design",
        "type": "complement"
      },
      {
        "slug": "identity-federation",
        "type": "complement"
      }
    ]
  },
  {
    "id": 202,
    "name": "Web Application Firewall (WAF) Patterns",
    "name_zh": "Web应用防火墙模式",
    "slug": "waf-patterns",
    "category": "security",
    "desc": "Application-layer traffic filtering strategies that inspect, filter, and block malicious HTTP/S requests before they reach web applications",
    "desc_zh": "在应用层检查、过滤和拦截恶意HTTP/S请求的流量防护策略，保护Web应用免受注入、跨站脚本等攻击",
    "steps": [
      "Deploy a WAF in front of web-facing applications by placing it between the internet edge (CDN or load balancer) and the application origin, ensuring all HTTP/HTTPS traffic passes through inspection",
      "Configure rule sets starting with managed rule groups (OWASP Core Rule Set, AWS Managed Rules, Cloudflare OWASP) in detection-only mode to observe false positives before enabling blocking mode",
      "Tune rules for application-specific traffic patterns: create allow-list rules for known-good paths and parameters, suppress rules that generate high false-positive rates for legitimate traffic, and add custom rules for business logic",
      "Implement rate limiting and bot management rules to block credential stuffing, scraping, and DDoS amplification patterns based on IP reputation, request velocity, and behavioral fingerprinting",
      "Establish WAF monitoring and alerting pipelines that feed block/allow decisions into a SIEM for correlation with application logs to detect evasion attempts and validate that the WAF is not blocking legitimate users"
    ],
    "steps_zh": [
      "通过将WAF放置在互联网边缘（CDN或负载均衡器）和应用源站之间，在面向互联网的应用程序前部署WAF，确保所有HTTP/HTTPS流量通过检查",
      "从在仅检测模式下使用托管规则组（OWASP核心规则集、AWS托管规则、Cloudflare OWASP）开始配置规则集，在启用阻止模式前观察误报",
      "针对应用程序特定流量模式调整规则：为已知良好的路径和参数创建允许列表规则，抑制对合法流量产生高误报率的规则，并为业务逻辑添加自定义规则",
      "实施速率限制和机器人管理规则，根据IP信誉、请求速度和行为指纹阻止凭证填充、爬取和DDoS放大模式",
      "建立WAF监控和告警管道，将阻止/允许决策输入SIEM与应用程序日志关联，以检测规避尝试并验证WAF没有阻止合法用户"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "WAF Placement",
      "Rule Sets",
      "Rule Tuning",
      "Rate Limiting",
      "SIEM Monitor"
    ],
    "viz_labels_zh": [
      "WAF部署",
      "规则集",
      "规则调优",
      "速率限制",
      "SIEM监控"
    ],
    "related": [
      "defense-in-depth",
      "zero-trust-architecture",
      "owasp-top-10",
      "penetration-testing-framework"
    ],
    "tags": [
      "waf",
      "application-firewall",
      "owasp",
      "bot-management",
      "rate-limiting"
    ],
    "origin_author": "Perfecto Technologies (first commercial WAF, 1999); OWASP ModSecurity Core Rule Set",
    "origin_source": "OWASP ModSecurity Core Rule Set (coreruleset.org); NIST SP 800-44 'Guidelines on Securing Public Web Servers'",
    "origin_source_zh": "OWASP ModSecurity核心规则集（coreruleset.org）；NIST SP 800-44「公共Web服务器安全指南」",
    "complexity": "intermediate",
    "when_to_use": [
      "When a web application must be protected against OWASP Top 10 vulnerabilities (SQLi, XSS, SSRF) as a defense-in-depth layer even if the application itself has been hardened",
      "When a third-party or legacy application cannot be patched quickly and a virtual patch via WAF rule is needed to mitigate a known vulnerability in production",
      "When compliance frameworks (PCI-DSS Requirement 6.6) mandate a WAF for cardholder data environment web applications",
      "When bot traffic, credential stuffing, or scraping attacks are causing measurable business impact and application-level rate limiting is insufficient"
    ],
    "when_to_use_zh": [
      "当Web应用程序必须针对OWASP十大漏洞（SQLi、XSS、SSRF）进行保护，作为纵深防御层，即使应用程序本身已经过加固",
      "当第三方或遗留应用程序无法快速修补，需要通过WAF规则进行虚拟修补以缓解生产中已知漏洞时",
      "当合规框架（PCI-DSS要求6.6）要求持卡人数据环境Web应用程序使用WAF时",
      "当机器人流量、凭证填充或爬取攻击造成可测量的业务影响且应用程序级速率限制不足时"
    ],
    "core_concepts": [
      "Positive security model: explicitly defining what is allowed (valid request shapes, parameter lengths, character sets) and blocking everything else; precise but high maintenance",
      "Negative security model: defining patterns of known-bad traffic (SQL injection signatures, XSS payloads) and blocking matches; lower maintenance but blind to zero-days",
      "Virtual patching: using WAF rules to block exploitation of a known vulnerability in the application layer without modifying the application code, buying time for proper remediation",
      "Bot management: distinguishing between legitimate bots (search engines, monitoring), malicious bots (credential stuffers, scrapers), and human traffic using behavioral analysis and challenge mechanisms"
    ],
    "core_concepts_zh": [
      "正向安全模型：明确定义允许的内容（有效请求形状、参数长度、字符集）并阻止其他所有内容；精确但维护成本高",
      "负向安全模型：定义已知恶意流量的模式（SQL注入签名、XSS载荷）并阻止匹配；维护成本低但对零日漏洞视而不见",
      "虚拟修补：使用WAF规则阻止利用应用程序层中已知漏洞，无需修改应用程序代码，为适当修复争取时间",
      "机器人管理：使用行为分析和挑战机制区分合法机器人（搜索引擎、监控）、恶意机器人（凭证填充、爬虫）和人类流量"
    ],
    "timeline": [
      [
        "1999",
        "Perfecto Technologies releases AppShield, one of the first commercial web application firewalls targeting application-layer attacks"
      ],
      [
        "2003",
        "ModSecurity 1.0 released as an open-source Apache module, democratizing WAF capabilities with the OWASP Core Rule Set"
      ],
      [
        "2017",
        "Cloud WAF services (AWS WAF, Cloudflare, Akamai Kona) achieve mainstream adoption, making WAF accessible without on-premises hardware"
      ],
      [
        "2023",
        "Next-generation WAFs add ML-based anomaly detection and API-aware inspection, moving beyond signature-based detection toward behavioral analysis"
      ]
    ],
    "timeline_zh": [
      [
        "1999",
        "Perfecto Technologies发布AppShield，这是最早针对应用层攻击的商业Web应用防火墙之一"
      ],
      [
        "2003",
        "ModSecurity 1.0作为开源Apache模块发布，通过OWASP核心规则集使WAF能力普及化"
      ],
      [
        "2017",
        "云WAF服务（AWS WAF、Cloudflare、Akamai Kona）获得主流采纳，使WAF无需本地硬件即可使用"
      ],
      [
        "2023",
        "下一代WAF添加基于ML的异常检测和API感知检查，从基于签名的检测转向行为分析"
      ]
    ],
    "dos": [
      "Do run in detection-only mode for at least two weeks before switching to blocking mode because poorly tuned WAFs block legitimate users at a rate that can exceed the security benefit",
      "Do establish a false-positive reporting workflow for application teams so that legitimate traffic blocked by the WAF is quickly triaged and added to an allow-list",
      "Do treat WAF as a defense-in-depth layer, not a substitute for secure coding because WAFs cannot protect against business logic flaws, authentication weaknesses, or vulnerabilities introduced by the application's own APIs",
      "Do keep WAF rule sets updated because new attack techniques emerge continuously and stale rule sets miss recent vulnerability patterns"
    ],
    "dos_zh": [
      "在切换到阻止模式前至少以仅检测模式运行两周，因为调整不当的WAF阻止合法用户的速率可能超过安全收益",
      "为应用程序团队建立误报报告工作流，使WAF阻止的合法流量被快速分类并添加到白名单",
      "将WAF视为纵深防御层，而非安全编码的替代品，因为WAF无法防护业务逻辑缺陷、身份验证弱点或应用程序自身API引入的漏洞",
      "保持WAF规则集更新，因为新攻击技术不断涌现，过时的规则集会错过最近的漏洞模式"
    ],
    "donts": [
      "Don't rely on a WAF as your only security control for web applications because sophisticated attackers study WAF rule patterns and craft payloads that evade signatures",
      "Don't skip application-layer security testing after deploying a WAF because the WAF creates a false sense of security that defers essential secure code review and penetration testing",
      "Don't block traffic based on geographic IP restrictions without careful analysis because VPN usage and shared IP ranges cause significant false positives",
      "Don't set WAF rules to permissive mode in production permanently because permissive mode is a temporary diagnostic state, not a valid long-term configuration"
    ],
    "donts_zh": [
      "不要将WAF作为Web应用程序的唯一安全控制，因为复杂的攻击者会研究WAF规则模式并精心设计规避签名的载荷",
      "不要在部署WAF后跳过应用程序层安全测试，因为WAF会产生延迟必要安全代码审查和渗透测试的虚假安全感",
      "不要在没有仔细分析的情况下基于地理IP限制阻止流量，因为VPN使用和共享IP范围会造成大量误报",
      "不要在生产中永久将WAF规则设置为宽松模式，因为宽松模式是临时诊断状态，不是有效的长期配置"
    ],
    "case_study_company": "Cloudflare",
    "case_study": "When the Log4Shell vulnerability (CVE-2021-44228) was disclosed in December 2021, Cloudflare deployed a WAF virtual patch rule within 4 hours of the CVE publication, before most organizations had even assessed their exposure. The rule blocked JNDI lookup strings in all HTTP headers and request bodies for all Cloudflare WAF customers. Within 72 hours of deployment, Cloudflare's WAF had blocked over 1 million exploitation attempts against their customers' applications, demonstrating how cloud WAF virtual patching can provide immediate protection for the long tail of organizations unable to patch Java applications quickly.",
    "case_study_zh": "2021年12月Log4Shell漏洞（CVE-2021-44228）披露时，Cloudflare在CVE发布后4小时内部署了WAF虚拟修补规则，而此时大多数组织甚至还未评估其暴露情况。该规则为所有Cloudflare WAF客户阻止了所有HTTP头和请求体中的JNDI查找字符串。部署后72小时内，Cloudflare的WAF已阻止超过100万次针对其客户应用程序的利用尝试，展示了云WAF虚拟修补如何为无法快速修补Java应用程序的大量组织提供即时保护。",
    "when_not_to_use": [
      "Internal APIs accessible only from within a private network where the threat model does not include external internet attackers",
      "When the application does not process untrusted user input and has no web-facing exposure that would benefit from application-layer filtering",
      "As a replacement for a secure software development lifecycle — WAFs address symptoms of insecure code, not the root cause",
      "When WAF latency overhead (typically 1-5ms) violates stringent real-time SLAs for latency-sensitive financial or gaming applications"
    ],
    "when_not_to_use_zh": [
      "仅从私有网络内部访问的内部API，威胁模型不包括外部互联网攻击者",
      "当应用程序不处理不可信用户输入且没有受益于应用层过滤的面向Web的暴露时",
      "作为安全软件开发生命周期的替代品——WAF解决不安全代码的症状，而非根本原因",
      "当WAF延迟开销（通常1-5毫秒）违反延迟敏感金融或游戏应用程序的严格实时SLA时"
    ],
    "adopters": [
      "Cloudflare",
      "AWS (AWS WAF)",
      "Akamai",
      "Fastly",
      "Imperva"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "OWASP (2023). \"ModSecurity Core Rule Set\". coreruleset.org.",
    "secondary_sources": [
      "NIST (2007). \"SP 800-44 Version 2: Guidelines on Securing Public Web Servers\". NIST.",
      "Dafydd Stuttard & Marcus Pinto (2011). \"The Web Application Hacker's Handbook\", 2nd ed. Wiley.",
      "Cloudflare (2022). \"Cloudflare WAF Documentation\". developers.cloudflare.com."
    ],
    "typed_relations": [
      {
        "slug": "defense-in-depth",
        "type": "extends"
      },
      {
        "slug": "zero-trust-architecture",
        "type": "complement"
      },
      {
        "slug": "owasp-top-10",
        "type": "complement"
      },
      {
        "slug": "penetration-testing-framework",
        "type": "complement"
      }
    ]
  },
  {
    "id": 203,
    "name": "Identity Federation",
    "name_zh": "身份联合",
    "slug": "identity-federation",
    "category": "security",
    "desc": "Cross-domain identity and SSO patterns (SAML, OIDC federation)",
    "desc_zh": "跨域身份和单点登录模式（SAML、OIDC联合）",
    "steps": [
      "Define the trust relationship between Identity Providers (IdP) and Service Providers (SP): the IdP authenticates the user and asserts their identity; the SP trusts the IdP's assertion and grants access based on the claimed attributes",
      "Select the federation protocol based on the integration scenario: SAML 2.0 for enterprise SSO with legacy applications, OpenID Connect (OIDC) for modern web and mobile apps, and OAuth 2.0 for delegated API authorization",
      "Establish the trust anchor by exchanging metadata: for SAML, exchange XML metadata containing signing certificates and endpoint URLs; for OIDC, register the client application with the IdP and configure the discovery endpoint",
      "Map IdP attributes to application roles and permissions: define attribute mappings that translate IdP claims (groups, email, department) to the application's authorization model, applying least-privilege principles",
      "Test federation flows end-to-end including SSO initiation, attribute assertion validation, session management, and Single Logout (SLO) to ensure that user sessions are cleanly terminated across all federated services when logging out"
    ],
    "steps_zh": [
      "定义身份提供者（IdP）和服务提供者（SP）之间的信任关系：IdP对用户进行身份验证并断言其身份；SP信任IdP的断言并根据声明的属性授予访问权限",
      "根据集成场景选择联合协议：SAML 2.0用于遗留应用程序的企业SSO，OpenID Connect（OIDC）用于现代Web和移动应用程序，OAuth 2.0用于委托API授权",
      "通过交换元数据建立信任锚：对于SAML，交换包含签名证书和端点URL的XML元数据；对于OIDC，向IdP注册客户端应用程序并配置发现端点",
      "将IdP属性映射到应用程序角色和权限：定义将IdP声明（组、电子邮件、部门）转换为应用程序授权模型的属性映射，应用最小权限原则",
      "端到端测试联合流程，包括SSO启动、属性断言验证、会话管理和单点注销（SLO），确保注销时所有联合服务的用户会话被干净终止"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "IdP",
      "Federation Protocol",
      "Metadata Exchange",
      "Attribute Mapping",
      "SSO Test"
    ],
    "viz_labels_zh": [
      "身份提供者",
      "联合协议",
      "元数据交换",
      "属性映射",
      "SSO测试"
    ],
    "related": [
      "zero-trust-architecture",
      "secrets-management"
    ],
    "tags": [
      "identity-federation",
      "sso",
      "saml",
      "oidc",
      "zero-trust"
    ],
    "origin_author": "OASIS (SAML 2.0, 2005); OpenID Foundation (OIDC, 2014)",
    "origin_source": "OASIS SAML 2.0 Specification (2005); OpenID Connect Core 1.0 Specification (2014); RFC 6749 OAuth 2.0",
    "origin_source_zh": "OASIS SAML 2.0规范（2005）；OpenID Connect Core 1.0规范（2014）；RFC 6749 OAuth 2.0",
    "complexity": "intermediate",
    "when_to_use": [
      "When employees need seamless access to dozens of SaaS applications using their corporate identity without separate passwords for each service",
      "When B2B integrations require partner organizations to authenticate their users using the partner's own IdP without creating accounts in your system",
      "When regulatory compliance requires centralized identity governance with unified access logging, provisioning, and deprovisioning across all applications",
      "When a microservices architecture needs a consistent identity layer so services can verify the identity and claims of callers from any trusted IdP"
    ],
    "when_to_use_zh": [
      "当员工需要使用其企业身份无缝访问数十个SaaS应用程序，无需为每个服务单独设置密码时",
      "当B2B集成要求合作伙伴组织使用合作伙伴自己的IdP对其用户进行身份验证，而无需在您的系统中创建账户时",
      "当监管合规要求实施集中身份治理，并在所有应用程序中统一访问日志记录、预配和取消预配时",
      "当微服务架构需要一致的身份层，使服务能够验证来自任何受信任IdP的调用方身份和声明时"
    ],
    "core_concepts": [
      "Identity Provider (IdP): the authoritative system that authenticates users and issues identity assertions (Okta, Azure AD, Google Workspace, Auth0)",
      "Service Provider (SP): the application that relies on the IdP for authentication and consumes identity assertions to authorize access",
      "SAML assertion: a signed XML document issued by the IdP containing user identity attributes, authentication method, and validity timestamps",
      "JWT / ID token: a signed JSON Web Token issued by an OIDC-compatible IdP containing claims about the authenticated user, used by modern applications as the identity assertion"
    ],
    "core_concepts_zh": [
      "身份提供者（IdP）：对用户进行身份验证并发出身份断言的权威系统（Okta、Azure AD、Google Workspace、Auth0）",
      "服务提供者（SP）：依赖IdP进行身份验证并使用身份断言来授权访问的应用程序",
      "SAML断言：IdP发出的包含用户身份属性、身份验证方法和有效期时间戳的签名XML文档",
      "JWT/ID令牌：OIDC兼容IdP发出的包含已认证用户声明的签名JSON Web令牌，被现代应用程序用作身份断言"
    ],
    "timeline": [
      [
        "2005",
        "OASIS ratifies SAML 2.0, establishing the dominant enterprise SSO standard for web applications and replacing vendor-specific proprietary protocols"
      ],
      [
        "2012",
        "OAuth 2.0 (RFC 6749) standardizes delegated authorization for APIs, separating authorization from identity"
      ],
      [
        "2014",
        "OpenID Connect 1.0 builds an identity layer on top of OAuth 2.0, providing a modern, JSON/JWT-based alternative to SAML for web and mobile applications"
      ],
      [
        "2022",
        "CISA Zero Trust Architecture guidance mandates identity federation as the foundation of zero trust access for federal agencies"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "OASIS批准SAML 2.0，建立Web应用程序的主流企业SSO标准，取代供应商专有协议"
      ],
      [
        "2012",
        "OAuth 2.0（RFC 6749）标准化API的委托授权，将授权与身份分离"
      ],
      [
        "2014",
        "OpenID Connect 1.0在OAuth 2.0之上构建身份层，为Web和移动应用程序提供基于JSON/JWT的现代SAML替代方案"
      ],
      [
        "2022",
        "CISA零信任架构指南将身份联合定为联邦机构零信任访问的基础"
      ]
    ],
    "dos": [
      "Do validate SAML assertion signatures and OIDC token signatures using the IdP's published public keys because unsigned or improperly validated assertions are trivially forgeable",
      "Do enforce attribute-based access control by mapping IdP group claims to application roles because authentication proves identity, not authorization, and the mapping is the application's responsibility",
      "Do implement Single Logout (SLO) correctly because failing to propagate logouts means a user's session persists in every federated SP after they log out of the IdP",
      "Do set short token validity periods and implement token refresh flows because long-lived tokens increase the window of exposure if an ID token is compromised"
    ],
    "dos_zh": [
      "使用IdP发布的公钥验证SAML断言签名和OIDC令牌签名，因为未签名或验证不当的断言可以被轻易伪造",
      "通过将IdP组声明映射到应用程序角色来执行基于属性的访问控制，因为身份验证证明身份，而非授权，映射是应用程序的责任",
      "正确实施单点注销（SLO），因为未能传播注销意味着用户从IdP注销后其会话在每个联合SP中仍然存在",
      "设置短令牌有效期并实施令牌刷新流程，因为长效令牌在ID令牌被泄露时会增加暴露窗口"
    ],
    "donts": [
      "Don't trust SAML assertions without verifying the InResponseTo field against the AuthnRequest ID because skipping this check allows replay attacks that reuse valid assertions from previous sessions",
      "Don't store IdP session cookies or OIDC tokens in localStorage because they are accessible to JavaScript and vulnerable to XSS-based token theft",
      "Don't implement custom federation code when production-hardened libraries (passport.js, Spring Security SAML, Keycloak) exist because protocol implementation errors are a common source of identity vulnerabilities",
      "Don't grant excessive claims in the IdP group-to-role mapping because over-privileged federated identities undermine the least-privilege principle"
    ],
    "donts_zh": [
      "不要在未验证InResponseTo字段与AuthnRequest ID匹配的情况下信任SAML断言，因为这会使攻击者得以发起重放攻击，重复使用先前会话中的有效断言",
      "不要将IdP会话Cookie或OIDC令牌存储在localStorage中，因为它们可被JavaScript访问且容易受到基于XSS的令牌盗取",
      "不要在已有经过生产环境检验的成熟库（passport.js、Spring Security SAML、Keycloak）时自行实现联合代码，因为协议实现错误是身份漏洞的常见来源",
      "不要在IdP组到角色映射中授予过多声明，因为过度特权的联合身份会破坏最小权限原则"
    ],
    "case_study_company": "Salesforce",
    "case_study": "Salesforce deployed SAML 2.0 identity federation across its enterprise SaaS platform in the late 2000s, allowing corporate customers to use their existing Active Directory identities to access Salesforce without creating separate accounts. The integration uses customer-controlled SAML IdPs (Okta, Azure AD, Ping) to authenticate users, with Salesforce acting as the SP and mapping SAML attributes to Salesforce profiles and permission sets. This eliminated over 95% of Salesforce-specific password reset requests at large enterprise customers, while providing centralized audit logs for compliance. The pattern has since been extended to Salesforce's full portfolio including Marketing Cloud, MuleSoft, and Tableau using OIDC.",
    "case_study_zh": "Salesforce在2000年代后期在其企业SaaS平台中部署了SAML 2.0身份联合，允许企业客户使用其现有的Active Directory身份访问Salesforce，无需创建单独账户。集成使用客户控制的SAML IdP（Okta、Azure AD、Ping）对用户进行身份验证，Salesforce充当SP并将SAML属性映射到Salesforce配置文件和权限集。这消除了大型企业客户95%以上的Salesforce特定密码重置请求，同时为合规提供集中审计日志。该模式此后已使用OIDC扩展到Salesforce完整产品组合，包括Marketing Cloud、MuleSoft和Tableau。",
    "when_not_to_use": [
      "Consumer-facing applications where social login (Google, Apple, Facebook OAuth) covers the use case and enterprise SAML federation is unnecessary complexity",
      "Very small organizations with only 2-3 applications where per-application credentials and a password manager are sufficient",
      "When the integration target only supports proprietary authentication mechanisms that do not implement SAML or OIDC",
      "Applications with stringent latency requirements where the additional SSO redirect round-trip is unacceptable for the user experience"
    ],
    "when_not_to_use_zh": [
      "面向消费者的应用程序：社交登录（Google、Apple、Facebook OAuth）已能满足需求，企业SAML联合只会带来不必要的复杂性",
      "只有2-3个应用程序的非常小的组织，每个应用程序的凭证和密码管理器已足够",
      "当集成目标仅支持不实现SAML或OIDC的专有身份验证机制时",
      "具有严格延迟要求的应用程序，额外的SSO重定向往返对用户体验不可接受"
    ],
    "adopters": [
      "Salesforce",
      "Microsoft (Azure AD)",
      "Google (Google Workspace)",
      "Okta",
      "AWS IAM Identity Center"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "OASIS (2005). \"Security Assertion Markup Language (SAML) 2.0\". OASIS Standard.",
    "secondary_sources": [
      "OpenID Foundation (2014). \"OpenID Connect Core 1.0\". openid.net.",
      "Hardt, D. (2012). \"RFC 6749: The OAuth 2.0 Authorization Framework\". IETF.",
      "Windley, P.J. (2005). \"Digital Identity\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "zero-trust-architecture",
        "type": "complement"
      },
      {
        "slug": "secrets-management",
        "type": "complement"
      }
    ]
  },
  {
    "id": 204,
    "name": "Penetration Testing Framework",
    "name_zh": "渗透测试框架",
    "slug": "penetration-testing-framework",
    "category": "security",
    "desc": "Structured offensive security testing methodology (PTES, OWASP Testing Guide)",
    "desc_zh": "结构化攻击性安全测试方法论（PTES、OWASP测试指南）",
    "steps": [
      "Define scope and rules of engagement: document target systems, IP ranges, and application URLs that are in scope; establish testing windows, emergency contact procedures, and explicit out-of-scope boundaries to prevent disruption to production",
      "Perform reconnaissance and threat intelligence: gather OSINT (open-source intelligence) about the target — exposed services, DNS records, public code repositories, employee profiles, and known CVEs in used technology versions",
      "Enumerate attack surface: conduct port scanning, service fingerprinting, web crawling, and API discovery to map all entry points; correlate findings with the OWASP Testing Guide categories relevant to the application type",
      "Exploit vulnerabilities: attempt to exploit identified weaknesses using controlled techniques (no destructive payloads); document proof-of-concept evidence including screenshots, request/response logs, and impact chains showing how a vulnerability leads to data exposure or privilege escalation",
      "Report and remediate: produce a finding report with CVSS scores, business impact narratives, reproduction steps, and prioritized remediation recommendations; conduct a retest after fixes are applied to verify remediation was effective"
    ],
    "steps_zh": [
      "定义范围和参与规则：记录在范围内的目标系统、IP范围和应用程序URL；建立测试窗口、紧急联系程序和明确的范围外边界以防止对生产的干扰",
      "执行侦察和威胁情报：收集目标的OSINT（开源情报）——暴露的服务、DNS记录、公共代码存储库、员工档案和已使用技术版本中的已知CVE",
      "枚举攻击面：进行端口扫描、服务指纹识别、Web爬取和API发现，以映射所有入口点；将发现对应到适用于该应用程序类型的OWASP测试指南类别",
      "利用漏洞：尝试使用受控技术（无破坏性载荷）利用已识别的弱点；记录概念验证证据，包括截图、请求/响应日志和显示漏洞如何导致数据泄露或权限提升的影响链",
      "报告和修复：生成包含CVSS评分、业务影响叙述、复现步骤和优先修复建议的发现报告；在应用修复后进行复测以验证修复是否有效"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Scope",
      "Reconnaissance",
      "Attack Surface",
      "Exploitation",
      "Report"
    ],
    "viz_labels_zh": [
      "测试范围",
      "侦察",
      "攻击面",
      "漏洞利用",
      "报告修复"
    ],
    "related": [
      "threat-modeling-stride",
      "owasp-top-10",
      "defense-in-depth",
      "waf-patterns"
    ],
    "tags": [
      "penetration-testing",
      "offensive-security",
      "ptes",
      "owasp",
      "red-team"
    ],
    "origin_author": "PTES (Penetration Testing Execution Standard), 2009; OWASP Testing Guide v1.0, 2004",
    "origin_source": "Penetration Testing Execution Standard (pentest-standard.org, 2012); OWASP Testing Guide v4.2 (2020); NIST SP 800-115 'Technical Guide to Information Security Testing'",
    "origin_source_zh": "渗透测试执行标准（pentest-standard.org，2012）；OWASP测试指南v4.2（2020）；NIST SP 800-115「信息安全测试技术指南」",
    "complexity": "advanced",
    "when_to_use": [
      "Before major product launches or after significant architectural changes when threat modeling and code review alone are insufficient to validate security posture",
      "When compliance frameworks (PCI-DSS, SOC 2 Type II, ISO 27001) require annual or continuous penetration testing as a mandatory control",
      "After a security incident to verify that the attacker's initial access path has been fully closed and no similar vulnerabilities remain",
      "When a new high-value attack surface is introduced (new public API, new cloud environment, new authentication system) and an adversarial perspective is needed"
    ],
    "when_to_use_zh": [
      "在重大产品发布前或重大架构变更后，当威胁建模和代码审查单独不足以验证安全态势时",
      "当合规框架（PCI-DSS、SOC 2 Type II、ISO 27001）要求年度或持续渗透测试作为强制控制时",
      "安全事件后验证攻击者的初始访问路径已完全关闭且没有类似漏洞残留",
      "当引入新的高价值攻击面（新公共API、新云环境、新身份验证系统）并需要对抗性视角时"
    ],
    "core_concepts": [
      "Black-box testing: the tester has no prior knowledge of the target system, simulating an external attacker; provides the most realistic adversarial simulation but is the least efficient use of testing time",
      "White-box testing: the tester has full access to source code, architecture diagrams, and credentials; maximizes coverage and depth but requires more preparation and does not simulate a realistic attacker perspective",
      "CVSS scoring: the Common Vulnerability Scoring System provides a standardized 0-10 severity score for each finding based on exploitability, impact, and scope, enabling consistent risk prioritization",
      "Lateral movement: demonstrating how an initial foothold in one system can be used to pivot to other systems in the environment, revealing the blast radius of a successful initial compromise"
    ],
    "core_concepts_zh": [
      "黑盒测试：测试者对目标系统没有先验知识，模拟外部攻击者；提供最真实的对抗性模拟，但测试时间利用效率最低",
      "白盒测试：测试者完全访问源代码、架构图和凭证；最大化覆盖率和深度，但需要更多准备，不模拟真实的攻击者视角",
      "CVSS评分：通用漏洞评分系统根据可利用性、影响和范围为每个发现提供标准化的0-10严重性评分，实现一致的风险优先级排序",
      "横向移动：展示如何使用一个系统中的初始立足点来转向环境中的其他系统，揭示成功初始入侵的爆炸半径"
    ],
    "timeline": [
      [
        "2004",
        "OWASP publishes the first Web Application Testing Guide, formalizing a systematic methodology for web application security testing"
      ],
      [
        "2009",
        "The Penetration Testing Execution Standard (PTES) working group begins developing a vendor-neutral, comprehensive pentest methodology"
      ],
      [
        "2018",
        "Bug bounty programs (HackerOne, Bugcrowd) mature as a continuous penetration testing model, complementing periodic point-in-time assessments"
      ],
      [
        "2023",
        "CTEM (Continuous Threat Exposure Management), coined by Gartner, integrates automated pen testing with continuous exposure validation and remediation workflows"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "OWASP发布第一个Web应用测试指南，正式化Web应用安全测试的系统方法论"
      ],
      [
        "2009",
        "渗透测试执行标准（PTES）工作组开始开发供应商中立的综合渗透测试方法论"
      ],
      [
        "2018",
        "漏洞赏金计划（HackerOne、Bugcrowd）作为持续渗透测试模型成熟，补充定期时点评估"
      ],
      [
        "2023",
        "Gartner创造的CTEM（持续威胁暴露管理）将自动化渗透测试与持续暴露验证和修复工作流程集成"
      ]
    ],
    "dos": [
      "Do obtain written authorization before testing because unauthorized penetration testing is illegal under the Computer Fraud and Abuse Act and equivalent laws regardless of intent",
      "Do use a gray-box testing approach where possible because it combines the efficiency of knowing the architecture with the adversarial realism of not having application credentials",
      "Do document every finding with evidence and a reproduction case because a finding without evidence is a claim, and a finding without reproduction steps cannot be reliably remediated",
      "Do retest all critical and high findings after remediation because verifying that the fix works is as important as finding the vulnerability in the first place"
    ],
    "dos_zh": [
      "在测试前获得书面授权，因为未经授权的渗透测试根据《计算机欺诈和滥用法》及同等法律是非法的，无论意图如何",
      "尽可能使用灰盒测试方法，因为它结合了了解架构的效率和没有应用程序凭证的对抗性真实性",
      "用证据和复现案例记录每个发现，因为没有证据的发现是一个主张，没有复现步骤的发现无法可靠修复",
      "在修复后对所有关键和高危发现进行复测，因为验证修复有效性与首先发现漏洞同样重要"
    ],
    "donts": [
      "Don't run penetration tests in production without a change management process because even read-only testing can trigger WAF blocks, alert fatigue, and accidental denial-of-service conditions",
      "Don't deliver a list of CVEs without business context because security reports that do not translate technical findings into business risk are ignored by decision makers",
      "Don't treat a clean penetration test report as proof of security because testers can only validate what they tested within their time box; absence of findings is not absence of vulnerabilities",
      "Don't use automated scanner output as a substitute for manual penetration testing because scanners miss business logic flaws, chained vulnerabilities, and context-specific attack paths"
    ],
    "donts_zh": [
      "不要在没有变更管理流程的情况下在生产中运行渗透测试，因为即使是只读测试也可能触发WAF阻止、告警疲劳和意外拒绝服务情况",
      "不要在没有业务背景的情况下提交CVE列表，因为不将技术发现转化为业务风险的安全报告会被决策者忽视",
      "不要将干净的渗透测试报告视为安全证明，因为测试者只能在其时间框内验证他们测试的内容；没有发现不等于没有漏洞",
      "不要将自动化扫描器输出作为手动渗透测试的替代品，因为扫描器会遗漏业务逻辑缺陷、链式漏洞和特定上下文的攻击路径"
    ],
    "case_study_company": "Dropbox",
    "case_study": "Dropbox runs a continuous penetration testing program combining an internal red team with an external bug bounty program on HackerOne. The internal red team focuses on advanced attack chains (social engineering, supply chain compromise, lateral movement from cloud infrastructure) that require deep context about Dropbox's architecture. The external bug bounty program handles the long tail of web application and API vulnerabilities with 4,000+ registered researchers. In 2022, Dropbox paid over $1M in bug bounties with 70% of critical findings coming from external researchers who identified logic flaws the internal team had missed, validating the complementary value of both approaches.",
    "case_study_zh": "Dropbox运行一个持续渗透测试计划，将内部红队与HackerOne外部漏洞赏金计划相结合。内部红队专注于需要深入了解Dropbox架构的高级攻击链（社会工程、供应链入侵、从云基础设施横向移动）。外部漏洞赏金计划由4,000多名注册研究人员处理大量Web应用程序和API漏洞。2022年，Dropbox支付了超过100万美元的漏洞赏金，70%的关键发现来自识别出内部团队遗漏的逻辑缺陷的外部研究人员，验证了两种方法的互补价值。",
    "when_not_to_use": [
      "As a substitute for secure development practices because discovering vulnerabilities after they are built is far more expensive than preventing them during design",
      "When the scope and rules of engagement cannot be clearly defined because an undefined scope leads to either incomplete testing or accidental out-of-scope impact",
      "As the first security activity for a new product — threat modeling and secure code review during development are more cost-effective than post-launch penetration testing",
      "When the development team has no capacity to remediate findings because unactioned penetration test findings accumulate into security debt without improving security posture"
    ],
    "when_not_to_use_zh": [
      "作为安全开发实践的替代品，因为在漏洞构建后发现它们远比在设计过程中预防它们更昂贵",
      "当无法清晰定义范围和参与规则时，因为未定义的范围会导致测试不完整或意外的范围外影响",
      "作为新产品的第一个安全活动——开发过程中的威胁建模和安全代码审查比发布后渗透测试更具成本效益",
      "当开发团队没有能力修复发现时，因为未采取行动的渗透测试发现会积累成安全债务而不改善安全态势"
    ],
    "adopters": [
      "Dropbox",
      "Microsoft (internal red team)",
      "Google (Project Zero)",
      "Facebook/Meta (Red Team)",
      "US DoD (Hack the Pentagon)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security"
    ],
    "maturity_ring": "foundational",
    "primary_source": "OWASP (2020). \"OWASP Testing Guide v4.2\". owasp.org/www-project-web-security-testing-guide.",
    "secondary_sources": [
      "NIST (2008). \"SP 800-115: Technical Guide to Information Security Testing and Assessment\". NIST.",
      "Wilhelm, T. (2013). \"Professional Penetration Testing\", 2nd ed. Syngress.",
      "Kennedy, D., O'Gorman, J., Kearns, D. & Aharoni, M. (2011). \"Metasploit: The Penetration Tester's Guide\". No Starch Press."
    ],
    "typed_relations": [
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      },
      {
        "slug": "owasp-top-10",
        "type": "complement"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      },
      {
        "slug": "waf-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 256,
    "name": "Security Development Lifecycle (SDL)",
    "name_zh": "安全开发生命周期（SDL）",
    "slug": "security-development-lifecycle",
    "category": "security",
    "desc": "Microsoft's structured process for integrating security and privacy practices at every phase of software development",
    "desc_zh": "微软在软件开发每个阶段融入安全与隐私实践的结构化流程",
    "steps": [
      "Training: Provide all team members with core security education covering attack patterns, secure coding standards, and privacy fundamentals before development begins",
      "Requirements: Define security and privacy requirements alongside functional requirements, establish bug bars, and identify compliance obligations (GDPR, HIPAA, PCI-DSS)",
      "Design: Perform threat modeling using STRIDE, define the attack surface, apply security design principles (least privilege, defense in depth, secure defaults)",
      "Implementation: Use approved tools and banned function lists, enforce static analysis (SAST) gates, conduct mandatory peer code reviews with security checklist",
      "Verification & Release: Execute dynamic analysis (DAST) and fuzzing, conduct penetration testing, perform final security review, document incident response plan before shipping"
    ],
    "steps_zh": [
      "培训：在开发开始前为所有团队成员提供核心安全教育，涵盖攻击模式、安全编码标准和隐私基础",
      "需求：与功能需求并行定义安全和隐私需求，建立缺陷基准，识别合规义务（GDPR、HIPAA、PCI-DSS）",
      "设计：使用STRIDE进行威胁建模，定义攻击面，应用安全设计原则（最小权限、纵深防御、安全默认值）",
      "实现：使用经批准的工具和禁止函数列表，强制执行静态分析（SAST）门控，进行带安全检查表的强制同行代码审查",
      "验证与发布：执行动态分析（DAST）和模糊测试，进行渗透测试，执行最终安全审查，在发布前记录事件响应计划"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Training",
      "Requirements",
      "Design",
      "Implementation",
      "Verification"
    ],
    "viz_labels_zh": [
      "安全培训",
      "安全需求",
      "安全设计",
      "安全实现",
      "安全验证"
    ],
    "related": [
      "threat-modeling-stride",
      "owasp-top-10",
      "defense-in-depth",
      "devsecops-pipeline"
    ],
    "tags": [
      "sdlc",
      "secure-development",
      "microsoft",
      "shift-left",
      "compliance"
    ],
    "origin_author": "Steve Lipner",
    "origin_source": "Howard, M. & Lipner, S. (2006). \"The Security Development Lifecycle\". Microsoft Press.",
    "origin_source_zh": "Howard, M. & Lipner, S.（2006）。「The Security Development Lifecycle」。微软出版社。",
    "complexity": "advanced",
    "when_to_use": [
      "When building software that handles sensitive user data, financial transactions, or critical infrastructure components",
      "In regulated industries (healthcare, finance, government) where documented security processes are required for compliance audits",
      "When onboarding a development organization to systematic security practices for the first time",
      "Before a major product launch where a security breach would cause significant reputational or financial damage"
    ],
    "when_to_use_zh": [
      "在构建处理敏感用户数据、金融交易或关键基础设施组件的软件时",
      "在合规审计要求记录安全流程的受监管行业（医疗、金融、政府）中",
      "首次让开发组织采用系统化安全实践时",
      "在重大产品发布前，安全漏洞将造成重大声誉或财务损失时"
    ],
    "core_concepts": [
      "Bug Bar: A minimum severity threshold that determines which security bugs must be fixed before release, providing consistent risk acceptance criteria",
      "Attack Surface Reduction: Systematically minimizing the code and interfaces exposed to untrusted input, thereby reducing the number of pathways an attacker can exploit",
      "Banned Functions: A list of APIs and library functions with known security weaknesses (e.g., strcpy, gets) that developers are prohibited from using",
      "Final Security Review (FSR): A gate-based review performed by a security team before every product release to confirm all SDL requirements have been met",
      "Trusted Computing Base (TCB): The subset of system components that must be correct for the security policy to be enforced, kept as small as possible"
    ],
    "core_concepts_zh": [
      "缺陷基准：确定哪些安全缺陷必须在发布前修复的最低严重性阈值，提供一致的风险接受标准",
      "攻击面缩减：系统性地最小化暴露给不受信任输入的代码和接口，从而减少攻击者可利用的路径数量",
      "禁止函数：开发人员被禁止使用的具有已知安全弱点的API和库函数列表（如strcpy、gets）",
      "最终安全审查（FSR）：每次产品发布前由安全团队执行的基于门控的审查，确认满足所有SDL要求",
      "可信计算基（TCB）：为执行安全策略而必须正确的系统组件子集，尽可能保持最小化"
    ],
    "timeline": [
      [
        "2002",
        "Microsoft launches Trustworthy Computing initiative following Bill Gates' famous memo, triggering SDL development"
      ],
      [
        "2004",
        "Microsoft mandates SDL across all Windows Server and developer tools product groups"
      ],
      [
        "2006",
        "Howard and Lipner publish \"The Security Development Lifecycle\", making the process publicly available"
      ],
      [
        "2012",
        "Microsoft releases SDL as an Agile-compatible process and publishes SDL Threat Modeling Tool v3"
      ]
    ],
    "timeline_zh": [
      [
        "2002",
        "微软在比尔·盖茨著名备忘录之后启动「可信计算」计划，触发SDL的开发"
      ],
      [
        "2004",
        "微软在所有Windows Server和开发者工具产品组强制推行SDL"
      ],
      [
        "2006",
        "Howard和Lipner出版「The Security Development Lifecycle」，使该流程公开可用"
      ],
      [
        "2012",
        "微软将SDL发布为敏捷兼容流程，并发布SDL威胁建模工具v3"
      ]
    ],
    "dos": [
      "Integrate SDL requirements as acceptance criteria in user stories so security is a first-class definition of done, not a separate audit",
      "Automate as much of the SDL as possible — run static analysis, dependency scanning, and secret detection on every commit so that security checks add almost no friction",
      "Use metrics (mean time to fix security bugs, SAST false-positive rate) to continuously improve the SDL process itself",
      "Train developers on the specific vulnerabilities most relevant to your stack rather than generic security awareness content"
    ],
    "dos_zh": [
      "将SDL需求作为用户故事的验收标准，使安全成为一等完成定义，而非独立审计",
      "尽可能自动化SDL——每次提交时运行静态分析、依赖扫描和密钥检测，使安全成为无摩擦的存在",
      "使用指标（安全缺陷平均修复时间、SAST误报率）持续改进SDL流程本身",
      "针对与你的技术栈最相关的特定漏洞培训开发人员，而非通用安全意识内容"
    ],
    "donts": [
      "Don't treat SDL as a one-time checklist at the end of a project because late-stage security fixes cost 30x more than design-phase fixes",
      "Don't assign SDL compliance solely to a security team because security is a shared responsibility that requires developer ownership",
      "Don't skip threat modeling for 'small' features because attackers routinely exploit seemingly minor changes that alter trust boundaries",
      "Don't implement SDL without executive sponsorship because security process adoption fails without organizational authority to enforce bug bars"
    ],
    "donts_zh": [
      "不要将SDL视为项目结束时的一次性检查表，因为后期阶段的安全修复成本比设计阶段高30倍",
      "不要将SDL合规工作仅分配给安全团队，因为安全是需要开发人员承担所有权的共同责任",
      "不要跳过「小」功能的威胁建模，因为攻击者经常利用改变信任边界的看似微小的变更",
      "不要在没有高管支持的情况下实施SDL，因为没有组织权威来执行缺陷基准，安全流程采用就会失败"
    ],
    "case_study_company": "Microsoft",
    "case_study": "After the 2002 Trustworthy Computing initiative, Microsoft halted Windows Server 2003 development for 60 days to train 8,500 developers on secure coding and conduct the first company-wide SDL review. The result was a 45% reduction in security bulletins between Windows XP and Windows Vista, and a 91% reduction in critical vulnerabilities in Windows Server 2003 compared to Windows Server 2000. Microsoft later open-sourced the SDL process and tooling, enabling organizations like Adobe, Cisco, and SAP to adopt it. The SDL became the template for NIST SP 800-218 (Secure Software Development Framework) and ISO/IEC 27034.",
    "case_study_zh": "在2002年「可信计算」计划之后，微软暂停Windows Server 2003的开发60天，对8,500名开发人员进行安全编码培训并进行首次全公司SDL审查。结果是Windows XP和Windows Vista之间安全公告减少了45%，Windows Server 2003与Windows Server 2000相比关键漏洞减少了91%。微软后来将SDL流程和工具开源，使Adobe、Cisco和SAP等组织能够采用它。SDL成为NIST SP 800-218（安全软件开发框架）和ISO/IEC 27034的模板。",
    "when_not_to_use": [
      "For throwaway prototypes or research spikes where the code will never reach production and security overhead would waste exploration time",
      "In very early-stage startups (pre-product-market-fit) where the overhead of formal SDL phases would prevent shipping at all — adopt a lightweight version first",
      "When the system handles only non-sensitive, publicly available data with no user authentication — full SDL overhead exceeds the actual risk"
    ],
    "when_not_to_use_zh": [
      "对于一次性原型或研究性尝试，代码永远不会进入生产环境，安全开销会浪费探索时间",
      "在极早期创业公司（产品市场契合前），正式SDL阶段的开销会完全妨碍发布——先采用精简版本",
      "当系统仅处理非敏感、公开可用的数据且没有用户认证时——完整SDL开销超过实际风险"
    ],
    "adopters": [
      "Microsoft",
      "Adobe",
      "Cisco",
      "SAP",
      "Siemens",
      "US Department of Defense (DoD)"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Howard, M. & Lipner, S. (2006). \"The Security Development Lifecycle\". Microsoft Press.",
    "secondary_sources": [
      "NIST (2022). \"SP 800-218: Secure Software Development Framework (SSDF)\". NIST.",
      "Shostack, A. (2014). \"Threat Modeling: Designing for Security\". Wiley.",
      "Microsoft (2012). \"SDL: Agile Development\". microsoft.com/en-us/securityengineering/sdl/agile."
    ],
    "typed_relations": [
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      },
      {
        "slug": "owasp-top-10",
        "type": "complement"
      },
      {
        "slug": "devsecops-pipeline",
        "type": "related"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      }
    ]
  },
  {
    "id": 257,
    "name": "NIST Cybersecurity Framework",
    "name_zh": "NIST网络安全框架",
    "slug": "nist-cybersecurity-framework",
    "category": "security",
    "desc": "A voluntary risk-based framework organizing cybersecurity activities into five concurrent functions: Identify, Protect, Detect, Respond, Recover",
    "desc_zh": "一个基于风险的自愿性框架，将网络安全活动组织为五个并发功能：识别、保护、检测、响应、恢复",
    "steps": [
      "Identify: Develop organizational understanding of systems, assets, data, and capabilities; conduct risk assessments and establish a governance structure for cybersecurity",
      "Protect: Implement safeguards for critical services — access control, awareness training, data security, maintenance, and protective technology deployment",
      "Detect: Define activities to identify cybersecurity events through continuous monitoring, anomaly detection, and security event logging",
      "Respond: Develop and implement response plans, coordinate communications, perform analysis, execute mitigation activities, and conduct post-incident reviews",
      "Recover: Identify and prioritize recovery activities, implement improvements based on lessons learned, and coordinate restoration of services and communications"
    ],
    "steps_zh": [
      "识别：建立对系统、资产、数据和能力的组织理解；进行风险评估并建立网络安全治理结构",
      "保护：为关键服务实施保护措施——访问控制、意识培训、数据安全、维护和保护技术部署",
      "检测：通过持续监控、异常检测和安全事件日志记录定义识别网络安全事件的活动",
      "响应：制定和实施响应计划，协调沟通，执行分析，实施缓解活动，并进行事后审查",
      "恢复：识别和优先处理恢复活动，基于经验教训实施改进，协调服务和通信的恢复"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "Identify",
      "Protect",
      "Detect",
      "Respond",
      "Recover"
    ],
    "viz_labels_zh": [
      "识别",
      "保护",
      "检测",
      "响应",
      "恢复"
    ],
    "related": [
      "defense-in-depth",
      "incident-response-playbook",
      "security-development-lifecycle",
      "zero-trust-architecture"
    ],
    "tags": [
      "nist",
      "risk-management",
      "governance",
      "compliance",
      "cybersecurity"
    ],
    "origin_author": "NIST",
    "origin_source": "NIST (2014). \"Framework for Improving Critical Infrastructure Cybersecurity\". NIST, Gaithersburg, MD.",
    "origin_source_zh": "NIST（2014）。「改善关键基础设施网络安全框架」。NIST，盖瑟斯堡，马里兰州。",
    "complexity": "intermediate",
    "when_to_use": [
      "When an organization needs a common language and structure to communicate cybersecurity risk to executives, boards, and regulators",
      "During cybersecurity program maturity assessments to identify gaps between current and target security postures",
      "In critical infrastructure sectors (energy, finance, healthcare) where the framework was originally designed and has regulatory recognition",
      "When integrating cybersecurity requirements across a complex supply chain with multiple vendors and partners"
    ],
    "when_to_use_zh": [
      "当组织需要通用语言和结构向高管、董事会和监管机构传达网络安全风险时",
      "在网络安全计划成熟度评估期间，识别当前和目标安全态势之间的差距",
      "在关键基础设施领域（能源、金融、医疗），框架最初为此设计并获得监管认可",
      "在跨多个供应商和合作伙伴的复杂供应链中整合网络安全要求时"
    ],
    "core_concepts": [
      "Framework Core: Five concurrent and continuous functions (Identify, Protect, Detect, Respond, Recover) each subdivided into Categories and Subcategories with informative references",
      "Implementation Tiers: Four tiers (Partial, Risk Informed, Repeatable, Adaptive) describing the degree to which cybersecurity risk management practices exhibit key characteristics",
      "Framework Profile: An organization's unique alignment of requirements, objectives, and resources against the Core, representing current state or desired target state",
      "Risk-Based Approach: Prioritization of cybersecurity investments based on business impact and likelihood of threats rather than compliance checkboxes",
      "Informative References: Mappings to existing standards (ISO 27001, COBIT, NIST SP 800-53) enabling organizations to use the CSF as a Rosetta Stone across frameworks"
    ],
    "core_concepts_zh": [
      "框架核心：五个并发持续的功能（识别、保护、检测、响应、恢复），每个功能细分为带有参考信息的类别和子类别",
      "实施层级：四个层级（部分、风险知情、可重复、自适应），描述网络安全风险管理实践展示关键特征的程度",
      "框架概要：组织的需求、目标和资源与核心的独特对齐，代表当前状态或期望的目标状态",
      "基于风险的方法：根据业务影响和威胁可能性而非合规清单来优先考虑网络安全投资",
      "参考信息：映射到现有标准（ISO 27001、COBIT、NIST SP 800-53），使组织能够将CSF用作跨框架的通用语言"
    ],
    "timeline": [
      [
        "2013",
        "Executive Order 13636 directs NIST to develop a cybersecurity framework for critical infrastructure following major grid attacks"
      ],
      [
        "2014",
        "NIST CSF v1.0 published after extensive public-private collaboration with over 3,000 stakeholders"
      ],
      [
        "2018",
        "NIST CSF v1.1 adds supply chain risk management subcategories and clarifies authentication guidance"
      ],
      [
        "2024",
        "NIST CSF v2.0 published, adding a sixth function (Govern) and expanding scope beyond critical infrastructure to all organizations"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "13636号行政令指示NIST在主要电网攻击后为关键基础设施制定网络安全框架"
      ],
      [
        "2014",
        "NIST CSF v1.0在与3,000多个利益相关者广泛公私合作后发布"
      ],
      [
        "2018",
        "NIST CSF v1.1增加供应链风险管理子类别并澄清认证指南"
      ],
      [
        "2024",
        "NIST CSF v2.0发布，增加第六个功能（治理）并将范围从关键基础设施扩展到所有组织"
      ]
    ],
    "dos": [
      "Use the Framework Profile to create a gap analysis between current and target states, then roadmap investments to close prioritized gaps",
      "Map your existing controls to CSF subcategories before buying new tools — most mature organizations already cover 60-70% of the framework",
      "Engage executives with Tier assessments rather than technical subcategory details, as Tiers communicate maturity in business language",
      "Revisit your Framework Profile annually or after major architecture changes to keep the risk picture current"
    ],
    "dos_zh": [
      "使用框架概要在当前状态和目标状态之间创建差距分析，然后制定路线图投资以缩小优先差距",
      "在购买新工具之前将现有控制措施映射到CSF子类别——大多数成熟组织已经覆盖框架的60-70%",
      "用层级评估而非技术子类别细节与高管互动，因为层级以商业语言传达成熟度",
      "每年或在重大架构变更后重新审视框架概要，以保持风险图景的时效性"
    ],
    "donts": [
      "Don't treat CSF as a compliance checklist — it is a risk management tool and every organization's target profile should reflect its unique risk appetite",
      "Don't attempt to reach Tier 4 (Adaptive) across all functions simultaneously because the resource cost is prohibitive and most organizations benefit more from closing Tier 1 gaps first",
      "Don't use CSF in isolation from other frameworks — it is explicitly designed to coexist with ISO 27001, SOC 2, and sector-specific regulations",
      "Don't skip the Identify function to jump to Protect — you cannot protect assets you haven't inventoried and risk-rated"
    ],
    "donts_zh": [
      "不要将CSF视为合规清查清单——它是一个风险管理工具，每个组织的目标概要应反映其独特的风险偏好",
      "不要尝试在所有功能上同时达到第4层（自适应），因为资源成本过高，大多数组织从首先弥补第1层差距中受益更多",
      "不要单独使用CSF——它明确设计为与ISO 27001、SOC 2和特定行业法规共存",
      "不要跳过识别功能直接跳到保护——你无法保护未清点和评级风险的资产"
    ],
    "case_study_company": "JPMorgan Chase",
    "case_study": "JPMorgan Chase adopted the NIST CSF following the 2014 data breach that exposed 76 million households' data. The bank used the Framework Profile to map their existing security controls across all five functions, identifying critical gaps in the Detect and Respond categories. They created a target profile aligned with Tier 3 (Repeatable) capabilities across all functions and Tier 4 (Adaptive) for their most critical trading and customer-data systems. The CSF provided a shared language that allowed the CISO to present cybersecurity investment needs to the board as business risk rather than technical jargon. JPMC now spends over $600M annually on cybersecurity and credits the structured CSF approach with enabling board-level prioritization of that investment.",
    "case_study_zh": "摩根大通在2014年数据泄露事件（泄露7,600万户家庭数据）后采用了NIST CSF。该银行使用框架概要映射所有五个功能中的现有安全控制措施，识别出检测和响应类别中的关键差距。他们创建了一个目标概要，在所有功能上与第3层（可重复）能力对齐，在最关键的交易和客户数据系统上与第4层（自适应）对齐。CSF提供了一种共同语言，允许CISO向董事会以业务风险而非技术术语呈现网络安全投资需求。摩根大通现在每年在网络安全上花费超过6亿美元，并将投资的董事会级优先排序归功于结构化的CSF方法。",
    "when_not_to_use": [
      "As a technical implementation guide — CSF describes what to do, not how; supplement it with NIST SP 800-53 or CIS Controls for prescriptive controls",
      "For very small organizations (under 50 employees) where the overhead of formal profile creation and tier assessment exceeds their entire security budget",
      "As a substitute for sector-specific regulations (PCI-DSS, HIPAA) — use CSF as a unifying overlay, not a replacement"
    ],
    "when_not_to_use_zh": [
      "作为技术实施指南——CSF描述做什么，而非如何做；用NIST SP 800-53或CIS控制措施补充规定性控制",
      "对于非常小的组织（50人以下），正式概要创建和层级评估的开销超过其整个安全预算",
      "作为特定行业法规（PCI-DSS、HIPAA）的替代——将CSF用作统一覆盖层，而非替代品"
    ],
    "adopters": [
      "JPMorgan Chase",
      "Verizon",
      "Intel",
      "Boeing",
      "US Department of Energy",
      "Anthem"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "NIST (2024). \"Cybersecurity Framework 2.0\". doi.org/10.6028/NIST.CSWP.29.",
    "secondary_sources": [
      "NIST (2018). \"Framework for Improving Critical Infrastructure Cybersecurity v1.1\". NIST.",
      "Calder, A. (2018). \"NIST Cybersecurity Framework: A Pocket Guide\". IT Governance Publishing.",
      "CIS (2021). \"CIS Controls v8 Mapping to NIST CSF\". cisecurity.org."
    ],
    "typed_relations": [
      {
        "slug": "defense-in-depth",
        "type": "complement"
      },
      {
        "slug": "incident-response-playbook",
        "type": "complement"
      },
      {
        "slug": "security-development-lifecycle",
        "type": "related"
      },
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      }
    ]
  },
  {
    "id": 258,
    "name": "DevSecOps Pipeline",
    "name_zh": "DevSecOps流水线",
    "slug": "devsecops-pipeline",
    "category": "security",
    "desc": "The integration of security tooling and culture into every stage of a CI/CD pipeline so that security is automated, continuous, and developer-owned rather than a final gate",
    "desc_zh": "将安全工具和文化集成到CI/CD流水线的每个阶段，使安全成为自动化、持续且由开发人员负责的工作，而非最终关卡",
    "steps": [
      "Shift-Left Planning: Embed security requirements in user stories, run automated dependency vulnerability scanning (SCA) on every pull request, and configure secret detection to block credentials from entering source control",
      "Secure Build: Integrate SAST tools (Semgrep, Checkmarx, SonarQube) into the CI pipeline as blocking quality gates with developer-visible feedback within minutes of commit",
      "Container and Infrastructure Security: Scan container images with tools like Trivy or Snyk in the registry pipeline, enforce policy-as-code (OPA, Conftest) for Kubernetes manifests and Terraform before deployment",
      "Dynamic and Runtime Testing: Run DAST (OWASP ZAP, Burp Suite API) against staging environments in the CD pipeline, enable runtime application self-protection (RASP) and cloud workload protection in production",
      "Continuous Feedback and Governance: Aggregate findings into a security dashboard (Defect Dojo, Veracode), route high-severity findings to developer queues, track mean time to remediation, and conduct blameless security post-mortems"
    ],
    "steps_zh": [
      "左移规划：在用户故事中嵌入安全需求，在每个拉取请求上运行自动化依赖漏洞扫描（SCA），配置密钥检测以阻止凭证进入源代码管理",
      "安全构建：将SAST工具（Semgrep、Checkmarx、SonarQube）集成到CI流水线中作为阻塞质量门控，在提交后数分钟内向开发人员提供可见反馈",
      "容器和基础设施安全：在注册表流水线中使用Trivy或Snyk等工具扫描容器镜像，在部署前对Kubernetes清单和Terraform强制执行策略即代码（OPA、Conftest）",
      "动态和运行时测试：在CD流水线的暂存环境中运行DAST（OWASP ZAP、Burp Suite API），在生产中启用运行时应用程序自我保护（RASP）和云工作负载保护",
      "持续反馈和治理：将发现汇总到安全仪表板（Defect Dojo、Veracode），将高严重性发现路由到开发人员队列，跟踪平均修复时间，并进行无指责安全事后审查"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Shift-Left",
      "Secure Build",
      "Container Security",
      "DAST Runtime",
      "Feedback Loop"
    ],
    "viz_labels_zh": [
      "左移安全",
      "安全构建",
      "容器安全",
      "动态测试",
      "持续反馈"
    ],
    "related": [
      "security-development-lifecycle",
      "owasp-top-10",
      "threat-modeling-stride",
      "defense-in-depth"
    ],
    "tags": [
      "devsecops",
      "ci-cd",
      "shift-left",
      "automation",
      "sast",
      "dast"
    ],
    "origin_author": "Shannon Lietz",
    "origin_source": "Lietz, S. (2012). \"DevSecOps: The DevOps Evolution\". devopsagenda.techtarget.com; Gartner (2016). \"DevSecOps: How to Seamlessly Integrate Security Into DevOps\".",
    "origin_source_zh": "Lietz, S.（2012）。「DevSecOps：DevOps的演进」。devopsagenda.techtarget.com；Gartner（2016）。「DevSecOps：如何将安全无缝集成到DevOps中」。",
    "complexity": "advanced",
    "when_to_use": [
      "When a team deploys multiple times per day and traditional security review gates create unacceptable bottlenecks in the delivery pipeline",
      "After a production security incident caused by a vulnerability that existed in code for weeks or months before discovery",
      "When building cloud-native or microservices architectures where the attack surface changes with every deployment",
      "In organizations adopting platform engineering — security tooling becomes a product offering of the internal developer platform"
    ],
    "when_to_use_zh": [
      "当团队每天部署多次，传统安全审查关卡在交付流水线中造成不可接受的瓶颈时",
      "在生产安全事故发生后，该事故由发现前已在代码中存在数周或数月的漏洞引起",
      "在构建云原生或微服务架构时，攻击面随每次部署而变化",
      "在采用平台工程的组织中——安全工具成为内部开发平台的产品服务"
    ],
    "core_concepts": [
      "Shift-Left Security: Moving security activities earlier in the SDLC so defects are caught at their cheapest fix point (design/code) rather than post-deployment",
      "Security as Code: Expressing security policies, compliance rules, and infrastructure hardening as version-controlled code that is tested and deployed like any other software artifact",
      "Continuous Compliance: Automated verification that every deployment meets defined security and regulatory requirements, producing audit trails without manual evidence collection",
      "Developer Security Champions: Developers embedded in teams with additional security training who act as first-line security reviewers, reducing bottlenecks on central security teams",
      "Software Composition Analysis (SCA): Automated scanning of open-source dependencies for known CVEs, license compliance issues, and transitive vulnerabilities in the dependency graph"
    ],
    "core_concepts_zh": [
      "左移安全：将安全活动移至SDLC的早期阶段，以便在最便宜的修复点（设计/代码）而非部署后捕获缺陷",
      "安全即代码：将安全策略、合规规则和基础设施加固表达为受版本控制的代码，像其他软件工件一样进行测试和部署",
      "持续合规：自动化验证每次部署是否满足定义的安全和监管要求，无需手动收集证据即可生成审计跟踪",
      "开发人员安全冠军：嵌入团队的具有额外安全培训的开发人员，充当一线安全审查员，减少对中央安全团队的瓶颈",
      "软件组合分析（SCA）：自动扫描开源依赖项，检查已知CVE、许可证合规问题和依赖图中的传递性漏洞"
    ],
    "timeline": [
      [
        "2012",
        "Shannon Lietz coins the term 「DevSecOps」 and launches the DevSecOps Manifesto to formalize security integration in DevOps"
      ],
      [
        "2016",
        "Gartner publishes「DevSecOps: How to Seamlessly Integrate Security Into DevOps」, driving enterprise adoption"
      ],
      [
        "2018",
        "GitLab launches built-in SAST/DAST/dependency scanning in its CI pipelines, making DevSecOps accessible out-of-the-box"
      ],
      [
        "2021",
        "Log4Shell vulnerability demonstrates DevSecOps SCA value as organizations with automated dependency scanning patched within hours versus weeks"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Shannon Lietz创造「DevSecOps」一词并发布DevSecOps宣言，以正式化DevOps中的安全集成"
      ],
      [
        "2016",
        "Gartner发布「DevSecOps：如何将安全无缝集成到DevOps中」，推动企业采用"
      ],
      [
        "2018",
        "GitLab在其CI流水线中推出内置SAST/DAST/依赖扫描，使DevSecOps开箱即用"
      ],
      [
        "2021",
        "Log4Shell漏洞证明了DevSecOps SCA的价值，具有自动化依赖扫描的组织在数小时内完成修补，而非数周"
      ]
    ],
    "dos": [
      "Start with secret detection and dependency scanning as they deliver immediate value with near-zero false positives and prevent the most common production incidents",
      "Tune SAST tools per-project to reduce false-positive rates below 10% before enabling them as blocking gates — high noise kills developer trust",
      "Give developers fix guidance alongside findings because a vulnerability alert without a remediation path creates anxiety, not security improvement",
      "Measure developer experience metrics (time to fix, false-positive rate, pipeline overhead) alongside security metrics to maintain developer buy-in"
    ],
    "dos_zh": [
      "从密钥检测和依赖扫描开始，因为它们以几乎零误报提供即时价值，防止最常见的生产事故",
      "在将SAST工具作为阻塞关卡启用之前，对每个项目进行调优以将误报率降至10%以下——高噪音会摧毁开发人员信任",
      "在发现旁边给开发人员提供修复指导，因为没有补救路径的漏洞警报会产生焦虑，而非安全改进",
      "与安全指标一起衡量开发人员体验指标（修复时间、误报率、流水线开销），以保持开发人员的认同"
    ],
    "donts": [
      "Don't add every available security scanner at once — pipeline sprawl with conflicting tool outputs overwhelms developers and slows delivery without security benefit",
      "Don't treat all findings as equally urgent — a CVSS 9.8 in a production API deserves different SLA treatment than a CVSS 3.2 in a development dependency",
      "Don't skip the cultural transformation — tooling without developer security ownership reverts to a security-team-as-gatekeeper model that DevSecOps was designed to replace",
      "Don't ignore runtime security in favor of pre-deployment scanning only because attackers exploit running systems, not source code"
    ],
    "donts_zh": [
      "不要一次添加所有可用的安全扫描器——具有冲突工具输出的流水线扩张会压倒开发人员并减慢交付，而没有安全收益",
      "不要将所有发现视为同等紧迫——生产API中的CVSS 9.8与开发依赖中的CVSS 3.2应得到不同的SLA处理",
      "不要跳过文化转型——没有开发人员安全所有权的工具化会恢复到DevSecOps旨在替代的安全团队充当看门人模式",
      "不要仅偏爱部署前扫描而忽视运行时安全，因为攻击者利用运行中的系统，而非源代码"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's Security Monkey and later Repokid projects exemplify their DevSecOps philosophy of automating security at scale. With 700+ microservices deploying thousands of times weekly, manual security review is impossible. Netflix built Security Monkey to continuously audit AWS IAM policies and S3 bucket permissions, automatically flagging and eventually auto-remediating policy drift. Their Repokid tool uses machine learning on CloudTrail logs to detect and remove unused IAM permissions, enforcing least-privilege automatically over time. This automated, developer-owned approach enabled Netflix to maintain a strong security posture across millions of lines of code without a security team that scales with the engineering headcount.",
    "case_study_zh": "Netflix的Security Monkey以及后来的Repokid项目体现了其大规模自动化安全的DevSecOps理念。每周部署数千次的700多个微服务使手动安全审查变得不可能。Netflix构建了Security Monkey来持续审计AWS IAM策略和S3存储桶权限，自动标记并最终自动修复策略漂移。他们的Repokid工具使用机器学习分析CloudTrail日志来检测和删除未使用的IAM权限，随着时间推移自动执行最小权限。这种自动化、开发人员拥有的方法使Netflix能够在数百万行代码中保持强大的安全态势，而无需随工程人员规模扩展的安全团队。",
    "when_not_to_use": [
      "For systems with very infrequent releases (quarterly or annual) where the overhead of pipeline security tooling exceeds the risk reduction — manual security reviews are more cost-effective",
      "When the team lacks the DevOps maturity for CI/CD itself — implement basic pipeline practices before layering security automation on top",
      "In air-gapped classified environments where commercial security scanning SaaS tools cannot be used and open-source alternatives require significant maintenance overhead"
    ],
    "when_not_to_use_zh": [
      "对于发布非常不频繁（季度或年度）的系统，流水线安全工具的开销超过风险降低——手动安全审查更具成本效益",
      "当团队缺乏CI/CD本身的DevOps成熟度时——在其上叠加安全自动化之前先实施基本流水线实践",
      "在商业安全扫描SaaS工具无法使用且开源替代方案需要大量维护开销的隔离机密环境中"
    ],
    "adopters": [
      "Netflix",
      "Etsy",
      "Capital One",
      "Spotify",
      "Atlassian",
      "GitLab"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Kim, G., Humble, J., Debois, P. & Willis, J. (2016). \"The DevOps Handbook\". IT Revolution Press.",
    "secondary_sources": [
      "OWASP (2020). \"DevSecOps Guideline\". owasp.org/www-project-devsecops-guideline.",
      "Gartner (2016). \"DevSecOps: How to Seamlessly Integrate Security Into DevOps\". Gartner Research.",
      "NIST (2022). \"SP 800-218: Secure Software Development Framework\". NIST."
    ],
    "typed_relations": [
      {
        "slug": "security-development-lifecycle",
        "type": "related"
      },
      {
        "slug": "owasp-top-10",
        "type": "complement"
      },
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      }
    ]
  },
  {
    "id": 259,
    "name": "Data Loss Prevention (DLP)",
    "name_zh": "数据防泄漏（DLP）",
    "slug": "data-loss-prevention",
    "category": "security",
    "desc": "A strategy and toolset for detecting, monitoring, and preventing the unauthorized transmission of sensitive data outside organizational boundaries",
    "desc_zh": "检测、监控和防止敏感数据未经授权传输到组织边界之外的策略和工具集",
    "steps": [
      "Data Discovery and Classification: Inventory all data stores (databases, file shares, cloud storage, endpoints) and classify content by sensitivity level (public, internal, confidential, restricted) using automated scanning and pattern matching",
      "Policy Definition: Define DLP rules aligned to business risk — patterns for PII (SSN, credit card, passport), PHI, intellectual property, and source code, specifying monitor, alert, or block actions by data type and channel",
      "Channel Coverage: Deploy DLP controls across all exfiltration vectors — email (O365 DLP, Proofpoint), endpoint (Symantec DLP, CrowdStrike), cloud (CASB — Netskope, MCAS), and network (proxy inspection)",
      "Incident Response Integration: Route DLP alerts to a SIEM with enriched context (user risk score, data sensitivity, destination), define escalation workflows, and integrate with HR and legal for insider-threat cases",
      "Tuning and Governance: Measure false-positive rates per policy, tune regex patterns and ML classifiers, conduct quarterly reviews of classification schema against evolving data landscape, and report DLP effectiveness to security leadership"
    ],
    "steps_zh": [
      "数据发现和分类：清点所有数据存储（数据库、文件共享、云存储、端点）并使用自动扫描和模式匹配按敏感级别（公开、内部、机密、受限）对内容进行分类",
      "策略定义：定义与业务风险对齐的DLP规则——PII（社会安全号、信用卡、护照）、PHI、知识产权和源代码的模式，按数据类型和渠道指定监控、告警或阻止操作",
      "渠道覆盖：在所有泄漏向量上部署DLP控制——电子邮件（O365 DLP、Proofpoint）、端点（Symantec DLP、CrowdStrike）、云（CASB——Netskope、MCAS）和网络（代理检查）",
      "事件响应集成：将DLP告警路由到带有丰富上下文（用户风险评分、数据敏感性、目的地）的SIEM，定义升级工作流，并与HR和法律部门集成处理内部威胁案例",
      "调优和治理：测量每个策略的误报率，调整正则表达式模式和ML分类器，对照不断演变的数据格局进行季度分类模式审查，并向安全领导层报告DLP有效性"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Data Discovery",
      "Policy Define",
      "Channel Coverage",
      "IR Integration",
      "Tuning"
    ],
    "viz_labels_zh": [
      "数据发现",
      "策略定义",
      "通道覆盖",
      "事件响应",
      "规则调优"
    ],
    "related": [
      "defense-in-depth",
      "nist-cybersecurity-framework",
      "zero-trust-architecture",
      "threat-modeling-stride"
    ],
    "tags": [
      "dlp",
      "data-protection",
      "insider-threat",
      "compliance",
      "pii"
    ],
    "origin_author": "Gartner",
    "origin_source": "Gartner (2006). \"Magic Quadrant for Data Loss Prevention\". Gartner Research; Securosis (2007). \"Understanding and Selecting a DLP Solution\".",
    "origin_source_zh": "Gartner（2006）。「数据防泄漏魔力象限」。Gartner研究；Securosis（2007）。「理解和选择DLP解决方案」。",
    "complexity": "advanced",
    "when_to_use": [
      "In regulated industries (finance, healthcare, legal) where data residency and exfiltration prevention are mandated by PCI-DSS, HIPAA, GDPR, or SOX",
      "When insider threat is a significant risk — departing employees, contractors with broad access, or mergers and acquisitions where data sovereignty matters",
      "After a data breach investigation reveals exfiltration occurred via email, USB, or cloud sync without detection",
      "When a cloud migration moves sensitive data to SaaS platforms (Salesforce, O365) requiring CASB-integrated DLP to maintain visibility"
    ],
    "when_to_use_zh": [
      "在PCI-DSS、HIPAA、GDPR或SOX强制要求数据驻留和防泄漏的受监管行业（金融、医疗、法律）",
      "当内部威胁是重大风险时——离职员工、拥有广泛访问权限的承包商，或数据主权重要的并购情况",
      "在数据泄露调查发现通过电子邮件、USB或云同步未被检测到的泄漏发生后",
      "当云迁移将敏感数据移至SaaS平台（Salesforce、O365），需要集成CASB的DLP来保持可见性时"
    ],
    "core_concepts": [
      "Data Classification: The categorization of data by sensitivity, regulatory obligation, and business value, which determines the DLP policy applied to each data asset",
      "Content Inspection: Deep examination of data in motion (network), at rest (storage), and in use (endpoint) using regular expressions, data fingerprinting, and ML classifiers",
      "Contextual Analysis: Evaluating not just content but context — who is sending, to where, via what channel, at what time — to reduce false positives and catch anomalous behavior",
      "CASB (Cloud Access Security Broker): A security policy enforcement point between cloud service users and providers that extends DLP visibility to SaaS and IaaS environments",
      "Insider Threat Integration: Combining DLP telemetry with user behavior analytics (UEBA) to distinguish accidental data handling mistakes from malicious exfiltration attempts"
    ],
    "core_concepts_zh": [
      "数据分类：按敏感性、监管义务和业务价值对数据进行分类，决定应用于每个数据资产的DLP策略",
      "内容检查：使用正则表达式、数据指纹和ML分类器深入检查传输中（网络）、静止中（存储）和使用中（端点）的数据",
      "上下文分析：不仅评估内容，还评估上下文——谁在发送、发往何处、通过什么渠道、在什么时间——以减少误报并捕获异常行为",
      "CASB（云访问安全代理）：云服务用户和提供商之间的安全策略执行点，将DLP可见性扩展到SaaS和IaaS环境",
      "内部威胁集成：将DLP遥测与用户行为分析（UEBA）相结合，以区分意外数据处理错误和恶意数据泄漏尝试"
    ],
    "timeline": [
      [
        "2003",
        "Vontu (later acquired by Symantec) launches one of the first commercial DLP products focusing on email and network inspection"
      ],
      [
        "2006",
        "Gartner formally defines the DLP market category, triggering enterprise adoption and vendor consolidation"
      ],
      [
        "2017",
        "Microsoft launches Office 365 DLP natively in the platform, making DLP accessible to organizations without dedicated DLP infrastructure"
      ],
      [
        "2022",
        "CASB-integrated DLP becomes the dominant architecture as SaaS and cloud storage overtake on-premise as the primary data exfiltration surface"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Vontu（后被Symantec收购）推出首批专注于电子邮件和网络检查的商业DLP产品之一"
      ],
      [
        "2006",
        "Gartner正式定义DLP市场类别，推动企业采用和供应商整合"
      ],
      [
        "2017",
        "微软在平台中原生推出Office 365 DLP，使没有专用DLP基础设施的组织能够访问DLP"
      ],
      [
        "2022",
        "随着SaaS和云存储超越本地部署成为主要数据泄漏面，集成CASB的DLP成为主流架构"
      ]
    ],
    "dos": [
      "Start with monitor-only mode for all new policies for 30 days to measure false-positive rates before enabling blocking actions that could disrupt legitimate workflows",
      "Involve business data owners in classification schema design because IT-defined classification schemas frequently misalign with how business units actually use and value data",
      "Integrate DLP alerts into user awareness at the point of violation — a real-time educational pop-up is more effective than a security team email days later",
      "Build a dedicated DLP tuning team rather than treating DLP as a set-and-forget tool, as data patterns and workflows evolve constantly"
    ],
    "dos_zh": [
      "在启用可能中断合法工作流的阻止操作之前，对所有新策略以仅监控模式运行30天以测量误报率",
      "让业务数据所有者参与分类模式设计，因为IT定义的分类模式经常与业务单元实际使用和评估数据的方式不一致",
      "在违规时将DLP告警整合到用户意识中——实时教育弹出窗口比几天后发来的安全团队电子邮件更有效",
      "建立专门的DLP调优团队，而非将DLP视为一次性设置工具，因为数据模式和工作流不断演变"
    ],
    "donts": [
      "Don't attempt to classify and protect all data at once — a phased approach starting with the highest-sensitivity data (PCI, PHI, IP) prevents program failure from scope overload",
      "Don't rely solely on regex-based detection for modern DLP because attackers obfuscate sensitive data (steganography, encoding) and ML-based contextual analysis is essential",
      "Don't deploy endpoint DLP without change management and employee communication — covert monitoring without transparency creates legal risk in many jurisdictions",
      "Don't treat DLP as a substitute for encryption — DLP detects exfiltration attempts but encryption ensures data is useless if exfiltrated"
    ],
    "donts_zh": [
      "不要试图一次分类和保护所有数据——从最高敏感度数据（PCI、PHI、IP）开始的分阶段方法可防止因范围过载导致计划失败",
      "不要仅依赖基于正则表达式的检测来进行现代DLP，因为攻击者会混淆敏感数据（隐写术、编码），基于ML的上下文分析是必不可少的",
      "不要在没有变更管理和员工沟通的情况下部署端点DLP——在许多司法管辖区，没有透明度的隐蔽监控会产生法律风险",
      "不要将DLP视为加密的替代品——DLP检测泄漏尝试，但加密确保即使数据被泄漏也毫无用处"
    ],
    "case_study_company": "Morgan Stanley",
    "case_study": "In 2019, Morgan Stanley faced a significant insider threat incident when a financial advisor exfiltrated data belonging to approximately 900 clients before departing. The bank's DLP system detected the anomalous bulk download from a wealth management system and flagged it within hours, enabling rapid legal response. The incident led Morgan Stanley to overhaul its DLP program with enhanced UEBA integration, tighter controls on bulk data exports, and a dedicated insider threat team. The UEBA-DLP integration reduced mean time to detect insider exfiltration from 78 days (industry average) to under 48 hours for high-risk departing employees, and the program became a benchmark for financial services sector DLP implementation.",
    "case_study_zh": "2019年，摩根士丹利面临严重的内部威胁事件，一名理财顾问在离职前泄漏了约900名客户的数据。该银行的DLP系统在数小时内检测到从财富管理系统异常批量下载的行为并标记，从而实现快速法律响应。该事件促使摩根士丹利彻底改革其DLP计划，增强UEBA集成，对批量数据导出实施更严格的控制，并建立专门的内部威胁团队。UEBA-DLP集成将内部泄漏的平均检测时间从78天（行业平均）缩短至高风险离职员工的48小时以内，该计划成为金融服务行业DLP实施的标杆。",
    "when_not_to_use": [
      "As a replacement for a data minimization strategy — the best DLP control is not collecting sensitive data in the first place",
      "For organizations processing only non-sensitive, publicly available data where classification overhead exceeds any risk reduction value",
      "As the only insider threat control — DLP must be layered with privileged access management (PAM), zero-trust network access, and HR process controls"
    ],
    "when_not_to_use_zh": [
      "作为数据最小化策略的替代——最好的DLP控制是首先不收集敏感数据",
      "对于仅处理非敏感公开可用数据的组织，分类开销超过任何风险降低价值",
      "作为唯一的内部威胁控制——DLP必须与特权访问管理（PAM）、零信任网络访问和HR流程控制分层使用"
    ],
    "adopters": [
      "Morgan Stanley",
      "Bank of America",
      "Pfizer",
      "Lockheed Martin",
      "US Department of Defense",
      "Deutsche Bank"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security"
    ],
    "maturity_ring": "established",
    "primary_source": "Securosis (2007). \"Understanding and Selecting a DLP Solution\". securosis.com.",
    "secondary_sources": [
      "Gartner (2022). \"Magic Quadrant for Data Loss Prevention\". Gartner Research.",
      "NIST (2018). \"SP 800-188: De-Identification of Government Datasets\". NIST.",
      "Cole, E. (2017). \"Advanced Persistent Threat: Understanding the Danger and How to Protect Your Organization\". Syngress."
    ],
    "typed_relations": [
      {
        "slug": "defense-in-depth",
        "type": "complement"
      },
      {
        "slug": "nist-cybersecurity-framework",
        "type": "complement"
      },
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      },
      {
        "slug": "incident-response-playbook",
        "type": "related"
      }
    ]
  },
  {
    "id": 260,
    "name": "Incident Response Playbook",
    "name_zh": "事件响应手册",
    "slug": "incident-response-playbook",
    "category": "security",
    "desc": "SANS Institute's six-step structured process for handling cybersecurity incidents from preparation through post-incident lessons learned",
    "desc_zh": "SANS研究所的六步结构化流程，用于处理从准备到事后经验总结的网络安全事件",
    "steps": [
      "Preparation: Establish incident response team (IRT) with defined roles, communication trees, and legal/HR contacts; deploy SIEM, EDR, and forensic tooling; run tabletop exercises quarterly to validate playbooks",
      "Identification: Detect anomalous events through SIEM correlation rules, EDR alerts, threat intelligence feeds, or user reports; triage to confirm the event is a security incident and classify severity (P1-P4)",
      "Containment: Implement short-term containment (isolate affected systems, revoke compromised credentials) followed by long-term containment (patch, rebuild, segment network) without destroying forensic evidence",
      "Eradication: Remove the root cause — malware, backdoors, unauthorized accounts, misconfigured services — and validate that all affected systems are clean through forensic verification",
      "Recovery: Restore services in a controlled sequence, monitor for signs of re-infection, verify business functions, and declare the incident closed only after sustained clean monitoring period"
    ],
    "steps_zh": [
      "准备：建立具有明确角色、通信树和法律/HR联系人的事件响应团队（IRT）；部署SIEM、EDR和取证工具；每季度进行桌面演练以验证手册",
      "识别：通过SIEM关联规则、EDR告警、威胁情报源或用户报告检测异常事件；分类确认该事件为安全事件并分类严重程度（P1-P4）",
      "遏制：实施短期遏制（隔离受影响系统、撤销被入侵凭证），然后是长期遏制（修补、重建、网络分段），同时不破坏取证证据",
      "根除：删除根本原因——恶意软件、后门、未授权账户、配置错误的服务——并通过取证验证确认所有受影响系统已清除",
      "恢复：按受控顺序恢复服务，监控再感染迹象，验证业务功能，仅在持续干净的监控期后宣布事件关闭"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Preparation",
      "Identification",
      "Containment",
      "Eradication",
      "Recovery"
    ],
    "viz_labels_zh": [
      "准备",
      "识别",
      "遏制",
      "消除",
      "恢复"
    ],
    "related": [
      "nist-cybersecurity-framework",
      "defense-in-depth",
      "threat-modeling-stride",
      "data-loss-prevention"
    ],
    "tags": [
      "incident-response",
      "sans",
      "forensics",
      "playbook",
      "breach"
    ],
    "origin_author": "SANS Institute",
    "origin_source": "SANS Institute (2004). \"Incident Handler's Handbook\". SANS Reading Room; Northcutt, S. (2003). \"Computer Security Incident Handling: Step by Step\". SANS Institute.",
    "origin_source_zh": "SANS研究所（2004）。「事件处理人员手册」。SANS阅读室；Northcutt, S.（2003）。「计算机安全事件处理：一步一步」。SANS研究所。",
    "complexity": "intermediate",
    "when_to_use": [
      "Before an incident occurs — playbooks must be built and tested in tabletop exercises before they are needed under pressure",
      "When a security alert is confirmed as a genuine incident requiring coordinated organizational response beyond individual analyst actions",
      "In regulated environments (finance, healthcare, critical infrastructure) where documented incident handling procedures are required for compliance",
      "When standing up a new security operations center (SOC) that needs standardized procedures for consistent incident handling across shifts and analysts"
    ],
    "when_to_use_zh": [
      "在事件发生之前——手册必须在需要在压力下使用之前建立并通过桌面演练进行测试",
      "当安全告警被确认为需要超出单个分析师行动的协调组织响应的真实事件时",
      "在要求记录事件处理程序以实现合规的受监管环境（金融、医疗、关键基础设施）中",
      "在建立需要跨班次和分析师进行一致事件处理的标准化程序的新安全运营中心（SOC）时"
    ],
    "core_concepts": [
      "Dwell Time: The duration between initial compromise and detection, a critical metric revealing detection capability gaps — the industry average is 21 days, top-tier SOCs achieve under 1 day",
      "Chain of Custody: The documented, unbroken record of who accessed forensic evidence and when, required for evidence admissibility in legal proceedings",
      "Indicators of Compromise (IoC): Artifacts of intrusion (IP addresses, file hashes, registry keys, domain names) used to detect and attribute attack activity across the environment",
      "Tabletop Exercise: A discussion-based simulation where team members walk through an incident scenario to test plan completeness, communication paths, and decision authority without real-world system impact",
      "RACI Matrix for IR: A responsibility assignment defining who is Responsible, Accountable, Consulted, and Informed for each step of the playbook, preventing coordination failures under pressure"
    ],
    "core_concepts_zh": [
      "驻留时间：初始入侵与检测之间的时长，是揭示检测能力差距的关键指标——行业平均为21天，顶级SOC可实现1天以内",
      "监管链：记录谁在何时访问取证证据的完整记录，是证据在法律诉讼中可采性的必要条件",
      "入侵指标（IoC）：入侵的痕迹（IP地址、文件哈希、注册表键、域名），用于在整个环境中检测和归因攻击活动",
      "桌面演练：基于讨论的模拟，团队成员演练事件场景以测试计划完整性、通信路径和决策权力，不影响真实系统",
      "IR的RACI矩阵：定义手册每个步骤中谁负责（Responsible）、谁问责（Accountable）、谁咨询（Consulted）、谁知情（Informed）的责任分配，防止在压力下出现协调失败"
    ],
    "timeline": [
      [
        "1991",
        "CERT/CC at Carnegie Mellon publishes foundational incident response guidelines following the Morris Worm incident"
      ],
      [
        "2003",
        "SANS Institute codifies the six-step incident handling process in「Computer Security Incident Handling: Step by Step」"
      ],
      [
        "2012",
        "NIST SP 800-61 Rev 2 formalizes computer security incident handling guidance, aligning with and extending the SANS model"
      ],
      [
        "2020",
        "SolarWinds supply chain attack demonstrates the need for playbooks covering nation-state threat actors and multi-tenant compromise scenarios"
      ]
    ],
    "timeline_zh": [
      [
        "1991",
        "卡内基梅隆大学的CERT/CC在莫里斯蠕虫事件后发布基础事件响应指南"
      ],
      [
        "2003",
        "SANS研究所在「计算机安全事件处理：一步一步」中编纂六步事件处理流程"
      ],
      [
        "2012",
        "NIST SP 800-61 Rev 2正式化计算机安全事件处理指南，与SANS模型对齐并进行扩展"
      ],
      [
        "2020",
        "SolarWinds供应链攻击证明了需要涵盖国家级威胁行为者和多租户入侵场景的手册"
      ]
    ],
    "dos": [
      "Conduct tabletop exercises at least quarterly with realistic scenarios (ransomware, insider threat, supply chain compromise) and include legal, PR, and HR in the exercise — incidents are not just technical events",
      "Pre-authorize containment actions (isolate a server, revoke a service account) to specific roles so responders don't lose hours seeking approval during an active incident",
      "Preserve forensic evidence before containment where possible — memory dumps, network captures, and log snapshots taken before isolation are often more valuable than post-containment forensics",
      "Maintain a war room communication channel (Slack #incident-response, Teams bridge) separate from production systems so a compromised environment doesn't impede communication"
    ],
    "dos_zh": [
      "至少每季度使用现实场景（勒索软件、内部威胁、供应链入侵）进行桌面演练，并在演练中纳入法律、公关和HR——事件不仅仅是技术事件",
      "预先授权特定角色执行遏制操作（隔离服务器、撤销服务账户），以便响应人员在活跃事件中不会浪费数小时寻求批准",
      "在可能的情况下在遏制前保存取证证据——隔离前获取的内存转储、网络捕获和日志快照通常比遏制后取证更有价值",
      "维护独立于生产系统的战情室通信渠道（Slack #incident-response、Teams桥接），以防受感染环境阻碍通信"
    ],
    "donts": [
      "Don't skip the Preparation phase and try to build the playbook during an active incident because decisions made under pressure without pre-defined procedures are error-prone and legally risky",
      "Don't communicate breach details over potentially compromised channels (corporate email, Slack) during containment because attackers monitoring those channels gain advance warning of response actions",
      "Don't eradicate before completing forensic preservation because destroying the attacker's artifacts before imaging affected systems permanently loses evidence needed for root cause analysis and attribution",
      "Don't declare incidents closed prematurely — many attackers establish persistence mechanisms that survive initial eradication; monitor for at least 30 days post-recovery for sophisticated threats"
    ],
    "donts_zh": [
      "不要跳过准备阶段，试图在活跃事件中建立手册，因为在没有预定义程序的压力下做出的决策容易出错且有法律风险",
      "在遏制期间不要通过可能受感染的渠道（公司电子邮件、Slack）传达泄漏细节，因为监控这些渠道的攻击者会提前获得响应行动的警告",
      "不要在完成取证保存之前进行根除，因为在对受影响系统进行镜像之前销毁攻击者的痕迹会永久丢失根本原因分析和归因所需的证据",
      "不要过早宣布事件关闭——许多攻击者建立能够在初始根除后存活的持久化机制；对复杂威胁在恢复后监控至少30天"
    ],
    "case_study_company": "Equifax",
    "case_study": "The 2017 Equifax breach, which exposed 147 million individuals' data, became the canonical case study for incident response failure. Equifax's IR process failed at multiple phases: a misconfigured security scanner missed the Apache Struts vulnerability for 78 days (Identification failure); once detected, the incident was not escalated to the executive team for six days (Communication failure); and eradication was incomplete, with the attacker maintaining access during initial remediation attempts. The subsequent Congressional investigation found that Equifax lacked a formal, tested incident response playbook. Post-breach, Equifax invested $1.4B in security overhaul including a formal IR program with quarterly tabletop exercises, pre-authorized containment procedures, and an external forensics retainer.",
    "case_study_zh": "2017年Equifax数据泄露（暴露1.47亿人数据）成为事件响应失败的典型案例。Equifax的IR流程在多个阶段失败：配置错误的安全扫描器遗漏了Apache Struts漏洞长达78天（识别失败）；一旦检测到，事件在六天内未升级到执行团队（通信失败）；根除不完整，攻击者在初始修复尝试期间维持访问。随后的国会调查发现Equifax缺乏正式的、经过测试的事件响应手册。泄露后，Equifax投资14亿美元进行安全改造，包括带季度桌面演练的正式IR计划、预先授权的遏制程序和外部取证顾问。",
    "when_not_to_use": [
      "A rigid six-step playbook may not fit nation-state APT scenarios where multiple simultaneous intrusion vectors require parallel workstreams rather than sequential phases — use adaptive IR frameworks for advanced threats",
      "For low-severity events (single user phishing click, no credential compromise) — a lightweight security ticket workflow is more appropriate than full IR activation to avoid alert fatigue",
      "Without legal counsel involvement in playbook design — incident response decisions (public disclosure, law enforcement notification, ransom consideration) carry significant legal obligations that vary by jurisdiction"
    ],
    "when_not_to_use_zh": [
      "刚性的六步手册可能不适合国家级APT场景，其中多个同时入侵向量需要并行工作流而非顺序阶段——对高级威胁使用自适应IR框架",
      "对于低严重性事件（单个用户钓鱼点击，无凭证泄露）——轻量级安全工单工作流比完整IR激活更合适，以避免告警疲劳",
      "在没有法律顾问参与手册设计的情况下——事件响应决策（公开披露、执法通知、赎金考虑）承载着因司法管辖区而异的重大法律义务"
    ],
    "adopters": [
      "US-CERT",
      "Microsoft DART",
      "CrowdStrike Services",
      "Mandiant",
      "PwC Cybersecurity",
      "Goldman Sachs"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "security",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "SANS Institute (2004). \"Incident Handler's Handbook\". SANS Reading Room. sans.org/reading-room/whitepapers/incident.",
    "secondary_sources": [
      "NIST (2012). \"SP 800-61 Rev 2: Computer Security Incident Handling Guide\". NIST.",
      "Cichonski, P., Millar, T., Grance, T. & Scarfone, K. (2012). \"Computer Security Incident Handling Guide\". NIST SP 800-61.",
      "Luttgens, J., Pepe, M. & Mandia, K. (2014). \"Incident Response & Computer Forensics\", 3rd ed. McGraw-Hill."
    ],
    "typed_relations": [
      {
        "slug": "nist-cybersecurity-framework",
        "type": "complement"
      },
      {
        "slug": "defense-in-depth",
        "type": "complement"
      },
      {
        "slug": "data-loss-prevention",
        "type": "related"
      },
      {
        "slug": "threat-modeling-stride",
        "type": "complement"
      }
    ]
  },
  {
    "id": 315,
    "name": "Runtime Application Self-Protection (RASP)",
    "name_zh": "运行时应用自我保护（RASP）",
    "slug": "runtime-application-self-protection",
    "category": "security",
    "desc": "A security technology that instruments application runtimes to detect and block attacks from within the running application context, with access to call stacks, data flows, and execution context that perimeter controls cannot see",
    "desc_zh": "在运行中的应用程序上下文内部检测和阻止攻击的安全技术，可访问外围控制无法看到的调用栈、数据流和执行上下文",
    "steps": [
      "Instrument the application runtime: integrate a RASP agent (via JVM agent, .NET profiler API, native library injection, or language-specific module) that hooks into security-sensitive operations such as SQL query execution, file system access, deserialization, and OS command execution",
      "Define detection rules for RASP sensors: configure which runtime events constitute attack indicators — SQL injection (tainted input in query string), path traversal (user-controlled file path segments), SSRF (user-controlled URL in outbound HTTP calls) — and set initial sensor mode to 'detect only'",
      "Validate detection accuracy in staging: run the RASP agent against automated test suites and penetration test payloads in a non-production environment; tune false positive thresholds before enabling blocking mode to avoid impacting legitimate traffic",
      "Enable blocking mode for high-confidence sensors: switch SQL injection, command injection, and deserialization sensors to block mode in production; leave lower-confidence sensors in detect-only mode and route alerts to SIEM for human triage",
      "Integrate RASP telemetry into the security operations workflow: pipe RASP attack events into your SIEM/SOAR platform, correlate with WAF and network IDS signals, and define escalation playbooks for each attack category detected by the runtime agent"
    ],
    "steps_zh": [
      "对应用运行时进行埋点：集成RASP代理（通过JVM代理、.NET分析器API、原生库注入或特定语言模块），钩入安全敏感操作，如SQL查询执行、文件系统访问、反序列化和操作系统命令执行",
      "为RASP传感器定义检测规则：配置哪些运行时事件构成攻击指标——SQL注入（查询字符串中的污点输入）、路径遍历（用户控制的文件路径段）、SSRF（出站HTTP调用中用户控制的URL）——并将初始传感器模式设置为「仅检测」",
      "在预发布环境验证检测准确性：在非生产环境中对自动化测试套件和渗透测试载荷运行RASP代理；在启用阻止模式以避免影响合法流量之前，调整误报阈值",
      "对高置信度传感器启用阻止模式：在生产中将SQL注入、命令注入和反序列化传感器切换到阻止模式；将低置信度传感器保留在仅检测模式并将告警路由到SIEM进行人工分类",
      "将RASP遥测集成到安全运营工作流：将RASP攻击事件输入SIEM/SOAR平台，与WAF和网络IDS信号关联，并为运行时代理检测到的每个攻击类别定义升级剧本"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Instrument Runtime",
      "Detection Rules",
      "Validate Accuracy",
      "Block Mode",
      "SIEM Integrate"
    ],
    "viz_labels_zh": [
      "运行时埋点",
      "检测规则",
      "精度验证",
      "阻断模式",
      "SIEM集成"
    ],
    "related": [
      "waf-patterns",
      "defense-in-depth",
      "owasp-top-10",
      "security-by-design",
      "threat-modeling-stride"
    ],
    "tags": [
      "security",
      "rasp",
      "runtime-security",
      "appsec",
      "intrusion-detection"
    ],
    "origin_author": "Gartner (Joseph Feiman coined the term RASP in 2012); commercial implementations by Contrast Security, Sqreen",
    "origin_year": 2012,
    "origin_source": "Gartner (2012). \"Designing Application Security into Your Systems\". Feiman, J. (2012). Gartner Research Note.",
    "origin_source_zh": "Gartner（2012）。《将应用安全设计进您的系统》。Feiman, J.（2012）。Gartner研究报告。",
    "complexity": "intermediate",
    "abstraction_level": "component",
    "maturity_ring": "established",
    "quality_concerns": [
      "security",
      "reliability",
      "observability"
    ],
    "adopters": [
      "Capital One",
      "Société Générale",
      "Cisco",
      "Hulu",
      "Wayfair"
    ],
    "when_to_use": [
      "Applications exposed to the internet that handle sensitive data and cannot patch known vulnerabilities immediately due to release cycle constraints",
      "Regulated industries (finance, healthcare) where compliance frameworks (PCI-DSS, HIPAA) require real-time attack detection and evidence of active controls inside the application layer",
      "Zero-day vulnerability periods where a RASP agent can block exploitation of a newly disclosed library vulnerability before a patched version is deployed",
      "Applications that cannot be fully covered by network-layer WAF rules due to encrypted payloads, WebSocket traffic, or complex application-specific attack surfaces"
    ],
    "when_to_use_zh": [
      "暴露在互联网上且处理敏感数据的应用程序，由于发布周期限制无法立即修补已知漏洞",
      "监管行业（金融、医疗），合规框架（PCI-DSS、HIPAA）要求应用层内的实时攻击检测和主动控制证据",
      "零日漏洞期间，RASP代理可以在部署修补版本之前阻止对新披露库漏洞的利用",
      "由于加密有效载荷、WebSocket流量或复杂的应用特定攻击面，无法被网络层WAF规则完全覆盖的应用程序"
    ],
    "core_concepts": [
      "In-Process Instrumentation: RASP agents operate inside the application process and hook into language runtime APIs (JVMTI, .NET Profiler) to intercept security-sensitive operations at the point of execution, not at the network boundary",
      "Taint Tracking: RASP engines mark user-supplied data as 'tainted' at entry points (HTTP parameters, headers, cookies) and detect when tainted data reaches dangerous sinks (SQL query builders, file path constructors, OS command executors) without sanitization",
      "Context-Aware Detection: Because RASP has access to the full call stack at the point of a potential attack, it can distinguish between legitimate application behavior and injected payloads with far lower false-positive rates than pattern-matching WAFs",
      "Blocking vs. Detection Mode: In blocking mode, the RASP agent terminates the offending request and returns an error; in detection mode, it logs the attack event and passes the request through — allowing teams to validate accuracy before enabling enforcement",
      "Defense in Depth Position: RASP is an inner ring of defense that complements perimeter controls (WAF, API gateway), not a replacement. It is most effective when integrated into a layered security architecture"
    ],
    "core_concepts_zh": [
      "进程内埋点：RASP代理在应用程序进程内部运行，钩入语言运行时API（JVMTI、.NET分析器），在执行点而非网络边界拦截安全敏感操作",
      "污点追踪：RASP引擎在入口点（HTTP参数、头、Cookie）将用户提供的数据标记为「污点」，并检测污点数据何时在未经清理的情况下到达危险汇点（SQL查询构建器、文件路径构造器、操作系统命令执行器）",
      "上下文感知检测：由于RASP在潜在攻击点可以访问完整调用栈，它可以区分合法应用行为和注入的有效载荷，误报率远低于模式匹配WAF",
      "阻止与检测模式：在阻止模式下，RASP代理终止违规请求并返回错误；在检测模式下，它记录攻击事件并放行请求——允许团队在启用执行前验证准确性",
      "纵深防御位置：RASP是补充外围控制（WAF、API网关）的内层防御，而非替代品。当集成到分层安全架构中时效果最佳"
    ],
    "timeline": [
      [
        2012,
        "Gartner analyst Joseph Feiman coins the term 'Runtime Application Self-Protection' in a research note"
      ],
      [
        2015,
        "First commercial RASP products (Contrast Security, Prevoty) reach the market with Java and .NET support"
      ],
      [
        2018,
        "RASP adoption grows in financial services following high-profile Struts and Log4j-class vulnerabilities that RASP could have blocked"
      ],
      [
        2022,
        "Cloud-native RASP approaches emerge using eBPF for kernel-level syscall interception as an alternative to language-specific agents"
      ]
    ],
    "timeline_zh": [
      [
        2012,
        "Gartner分析师Joseph Feiman在研究报告中创造了「运行时应用自我保护」这一术语"
      ],
      [
        2015,
        "首批商业RASP产品（Contrast Security、Prevoty）支持Java和.NET上市"
      ],
      [
        2018,
        "在RASP本可阻止的高调Struts和Log4j类漏洞之后，金融服务中的RASP采用率增长"
      ],
      [
        2022,
        "使用eBPF进行内核级系统调用拦截的云原生RASP方案作为特定语言代理的替代品涌现"
      ]
    ],
    "dos": [
      "Do start RASP in detection-only mode and run it through a full regression and load test before enabling blocking — a false positive in blocking mode can take down production traffic",
      "Do correlate RASP events with WAF and SIEM data to build a complete picture of attacker behavior; RASP events that reach the runtime often indicate WAF bypasses worth investigating",
      "Do measure the RASP agent's performance overhead in production-representative load tests; accept up to 5% overhead as reasonable, escalate to the vendor if overhead exceeds 10%",
      "Do use RASP as a compensating control during zero-day vulnerability windows — it buys time to test and deploy patches without leaving systems fully exposed"
    ],
    "dos_zh": [
      "先以仅检测模式启动RASP，并在启用阻止前通过完整的回归和负载测试——阻止模式中的误报可能中断生产流量",
      "将RASP事件与WAF和SIEM数据关联，以建立攻击者行为的完整图像；到达运行时的RASP事件通常表明值得调查的WAF绕过",
      "在代表性生产负载测试中测量RASP代理的性能开销；接受最高5%的开销为合理，如果开销超过10%则升级到供应商",
      "在零日漏洞窗口期间使用RASP作为补偿控制——它争取时间测试和部署补丁，而不使系统完全暴露"
    ],
    "donts": [
      "Don't treat RASP as a substitute for secure coding practices — it is a safety net for exploitation attempts, not a license to write vulnerable code",
      "Don't enable blocking mode for all sensors simultaneously in production — start with the highest-confidence, lowest-false-positive sensors (SQL injection, command injection) and expand gradually",
      "Don't ignore RASP agent updates — the agent's detection rules must stay current with new attack patterns; an outdated RASP agent provides a false sense of security",
      "Don't underestimate the vendor lock-in risk — RASP agents are deeply integrated with the application runtime; migrating between vendors requires re-testing all detection and blocking rules"
    ],
    "donts_zh": [
      "不要将RASP视为安全编码实践的替代品——它是利用尝试的安全网，而非编写易受攻击代码的许可证",
      "不要在生产中同时为所有传感器启用阻止模式——从最高置信度、最低误报的传感器（SQL注入、命令注入）开始，逐步扩展",
      "不要忽视RASP代理更新——代理的检测规则必须与新的攻击模式保持同步；过时的RASP代理会提供虚假的安全感",
      "不要低估供应商锁定风险——RASP代理与应用运行时深度集成；在供应商之间迁移需要重新测试所有检测和阻止规则"
    ],
    "case_study_company": "Capital One",
    "case_study": "Capital One deployed Contrast Security's RASP agent across their Java-based banking microservices following the 2019 Capital One breach (caused by a misconfigured WAF, not an application vulnerability). As a compensating control during the Log4Shell (CVE-2021-44228) zero-day in December 2021, their RASP agents detected and blocked JNDI lookup injection attempts in real time across their application fleet within hours of the CVE publication — before patched Log4j versions were validated and deployed. Their security team credited RASP detection telemetry with providing definitive proof that no successful exploitation occurred during the vulnerability window.",
    "case_study_zh": "Capital One在2019年Capital One数据泄露（由错误配置的WAF而非应用漏洞引起）之后，在其基于Java的银行微服务中部署了Contrast Security的RASP代理。作为2021年12月Log4Shell（CVE-2021-44228）零日漏洞期间的补偿控制，他们的RASP代理在CVE发布后数小时内实时检测并阻止了整个应用舰队的JNDI查找注入尝试——早于修补的Log4j版本被验证和部署。他们的安全团队将RASP检测遥测归功于提供了在漏洞窗口期间没有发生成功利用的确凿证明。",
    "when_not_to_use": [
      "Applications with extremely tight latency SLOs (sub-millisecond P99) where even 1-2% RASP overhead is unacceptable",
      "Languages or runtimes for which mature RASP agents do not exist (many Go, Rust, Elixir services) — immature agents introduce more risk than they mitigate",
      "Serverless functions with very short execution durations where agent initialization overhead exceeds function execution time",
      "Teams without a security operations capability to triage RASP alerts — deploying RASP without alert handling creates alert fatigue and eventually agent misconfiguration"
    ],
    "when_not_to_use_zh": [
      "延迟SLO极严（P99亚毫秒级）的应用程序，即使1-2%的RASP开销也不可接受",
      "不存在成熟RASP代理的语言或运行时（许多Go、Rust、Elixir服务）——不成熟的代理引入的风险超过其缓解的风险",
      "执行时间极短的无服务器函数，代理初始化开销超过函数执行时间",
      "没有安全运营能力来分类RASP告警的团队——在没有告警处理的情况下部署RASP会产生告警疲劳并最终导致代理配置错误"
    ],
    "primary_source": "Gartner (2012). \"Designing Application Security into Your Systems\". Feiman, J. Research Note G00237047.",
    "primary_source_zh": "Gartner（2012）。《将应用安全设计进您的系统》。Feiman, J. 研究报告G00237047。",
    "secondary_sources": [
      "OWASP Foundation (2021). \"OWASP AppSensor Project: Runtime Application Self-Protection\". owasp.org/www-project-appsensor",
      "Contrast Security (2022). \"State of Application Security Report\". contrastsecurity.com"
    ],
    "secondary_sources_zh": [
      "OWASP基金会（2021）。《OWASP AppSensor项目：运行时应用自我保护》。owasp.org/www-project-appsensor",
      "Contrast Security（2022）。《应用安全状态报告》。contrastsecurity.com"
    ]
  },
  {
    "id": 121,
    "name": "Consensus Protocols (Raft/Paxos)",
    "name_zh": "共识协议（Raft/Paxos）",
    "slug": "consensus-protocols",
    "category": "distributed",
    "desc": "Algorithms for achieving agreement among distributed nodes despite failures",
    "desc_zh": "在节点可能故障的分布式系统中达成一致的算法",
    "steps": [
      "Define the replicated state machine: identify what state must be consistently replicated across all nodes in the cluster",
      "Elect a leader: use randomized timeouts (Raft) or proposal numbers (Paxos) to select a single node responsible for coordinating writes",
      "Replicate log entries: the leader appends client requests to its log and sends AppendEntries RPCs to followers, waiting for a majority quorum to acknowledge",
      "Commit and apply: once a majority of nodes have persisted the entry, the leader marks it committed and all nodes apply it to their state machines",
      "Handle leader failure: when followers detect a missing heartbeat, they increment the term, start a new election, and the cluster converges on a new leader"
    ],
    "steps_zh": [
      "定义复制状态机：确定集群中所有节点需要一致复制的状态内容",
      "选举领导者：使用随机超时（Raft）或提案编号（Paxos）选出一个负责协调写入的节点",
      "复制日志条目：领导者将客户端请求追加到日志中，并向跟随者发送AppendEntries RPC，等待多数派确认",
      "提交并应用：一旦多数节点持久化了该条目，领导者将其标记为已提交，所有节点将其应用到状态机",
      "处理领导者故障：当跟随者检测到心跳缺失时，递增任期号，发起新选举，集群收敛到新领导者"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "State Machine",
      "Leader Election",
      "Log Replication",
      "Commit",
      "Failure Recovery"
    ],
    "viz_labels_zh": [
      "状态机",
      "领导者选举",
      "日志复制",
      "提交",
      "故障恢复"
    ],
    "related": [
      "cap-theorem",
      "leader-election",
      "two-phase-commit"
    ],
    "tags": [
      "consensus",
      "replication",
      "fault-tolerance",
      "distributed-state"
    ],
    "origin_author": "Leslie Lamport (Paxos, 1998); Diego Ongaro & John Ousterhout (Raft, 2014)",
    "origin_source": "The Part-Time Parliament (ACM TOCS 1998); In Search of an Understandable Consensus Algorithm (USENIX ATC 2014); Kleppmann, Designing Data-Intensive Applications, Ch. 9",
    "origin_source_zh": "《兼职议会》（ACM TOCS 1998）；《寻找一种可理解的共识算法》（USENIX ATC 2014）；Kleppmann《数据密集型应用系统设计》第9章",
    "complexity": "advanced",
    "when_to_use": [
      "When building a replicated log or replicated state machine that must tolerate node failures without data loss",
      "When implementing a distributed coordination service like a lock manager, configuration store, or membership registry",
      "When strong consistency guarantees are required and the system can tolerate the latency of quorum writes",
      "When you need automatic leader failover without manual intervention in a clustered database or message broker"
    ],
    "when_to_use_zh": [
      "当构建必须容忍节点故障且不丢失数据的复制日志或复制状态机时",
      "当实现锁管理器、配置存储或成员注册表等分布式协调服务时",
      "当需要强一致性保证，且系统可以容忍法定人数写入的延迟时",
      "当需要在集群数据库或消息代理中实现无需人工干预的自动领导者故障转移时"
    ],
    "core_concepts": [
      "Quorum: A majority of nodes (e.g., 3 of 5) must agree before a value is committed, ensuring any two quorums overlap",
      "Term/epoch: A monotonically increasing logical clock that partitions time into leadership periods, preventing stale leaders from issuing writes",
      "Log replication: The leader maintains an ordered log of commands and ensures all followers converge to the same log sequence",
      "Safety property: Once a log entry is committed, no future leader can overwrite it, guaranteeing linearizable reads and writes",
      "Liveness property: As long as a majority of nodes are reachable and can communicate, the system will eventually elect a leader and make progress"
    ],
    "core_concepts_zh": [
      "法定人数：多数节点（如5中的3）必须同意后值才能提交，确保任意两个法定人数有交集",
      "任期/纪元：单调递增的逻辑时钟，将时间划分为领导期，防止过期领导者发出写入",
      "日志复制：领导者维护有序的命令日志，确保所有跟随者收敛到相同的日志序列",
      "安全性：一旦日志条目被提交，未来的领导者都无法覆盖它，保证线性一致的读写",
      "活性：只要多数节点可达且能通信，系统最终会选出领导者并继续处理请求"
    ],
    "timeline": [
      [
        "1989",
        "Leslie Lamport writes the original Paxos paper using the metaphor of a Greek parliament"
      ],
      [
        "1998",
        "The Part-Time Parliament is finally published in ACM Transactions on Computer Systems"
      ],
      [
        "2001",
        "Lamport publishes Paxos Made Simple to make the algorithm more accessible"
      ],
      [
        "2014",
        "Diego Ongaro and John Ousterhout publish Raft as a more understandable alternative to Paxos"
      ],
      [
        "2017",
        "Kleppmann's DDIA synthesizes consensus protocols in the context of modern data systems"
      ]
    ],
    "timeline_zh": [
      [
        "1989",
        "Leslie Lamport使用希腊议会的比喻撰写了最初的Paxos论文"
      ],
      [
        "1998",
        "《兼职议会》最终发表在ACM计算机系统汇刊上"
      ],
      [
        "2001",
        "Lamport发表《Paxos Made Simple》使算法更易理解"
      ],
      [
        "2014",
        "Diego Ongaro和John Ousterhout发表Raft作为比Paxos更易理解的替代方案"
      ],
      [
        "2017",
        "Kleppmann的《数据密集型应用系统设计》在现代数据系统背景下综合介绍共识协议"
      ]
    ],
    "dos": [
      "Do deploy an odd number of nodes (3, 5, 7) because even numbers increase the chance of split votes without improving fault tolerance",
      "Do persist log entries and term metadata to stable storage before responding because in-memory-only state leads to data loss on restart",
      "Do implement pre-vote protocol extensions because they prevent disruptive elections from partitioned nodes rejoining the cluster",
      "Do benchmark your quorum latency under realistic network conditions because consensus adds round-trip overhead to every write"
    ],
    "dos_zh": [
      "部署奇数个节点（3、5、7），因为偶数节点增加了分裂投票的概率而不提升容错能力",
      "在响应前将日志条目和任期元数据持久化到稳定存储，因为仅内存状态在重启时会导致数据丢失",
      "实现预投票协议扩展，防止被分区隔离的节点重新加入集群时引发破坏性选举",
      "在真实网络条件下对法定人数延迟进行基准测试，因为共识会为每次写入增加往返开销"
    ],
    "donts": [
      "Don't run consensus across wide-area networks without accounting for latency because cross-datacenter round trips can make quorum writes unacceptably slow",
      "Don't use consensus for high-throughput data that tolerates eventual consistency because the overhead of quorum writes is unnecessary",
      "Don't ignore log compaction and snapshotting because unbounded log growth will exhaust disk space and slow down follower recovery",
      "Don't assume consensus alone solves all distributed problems because it handles replication but not sharding, load balancing, or schema evolution"
    ],
    "donts_zh": [
      "不要在不考虑延迟的情况下跨广域网运行共识，因为跨数据中心的往返可能使法定人数写入慢到不可接受",
      "不要对可容忍最终一致性的高吞吐数据使用共识，因为法定人数写入的开销是不必要的",
      "不要忽视日志压缩和快照，因为无限增长的日志会耗尽磁盘空间并减慢跟随者恢复速度",
      "不要假设共识能解决所有分布式问题，它处理复制但不处理分片、负载均衡或模式演进"
    ],
    "case_study_company": "etcd / Kubernetes",
    "case_study": "Kubernetes relies on etcd, a distributed key-value store implementing the Raft consensus protocol, as its single source of truth for all cluster state. Every API server write (pod scheduling, config changes, secret updates) is committed through Raft to an etcd cluster of typically 3 or 5 nodes. This design ensures that even if a minority of etcd nodes fail, the cluster state remains consistent and recoverable. When CoreOS (later acquired by Red Hat) originally built etcd in 2013, they chose Raft over Paxos specifically for its understandability, which allowed a broader contributor base to verify correctness.",
    "case_study_zh": "Kubernetes依赖etcd——一个实现Raft共识协议的分布式键值存储——作为所有集群状态的唯一事实来源。每个API服务器写入（Pod调度、配置变更、密钥更新）都通过Raft提交到通常由3或5个节点组成的etcd集群。这种设计确保即使少数etcd节点故障，集群状态仍然一致且可恢复。当CoreOS（后被Red Hat收购）在2013年构建etcd时，他们选择Raft而非Paxos，正是因为其可理解性，这使得更广泛的贡献者能够验证正确性。",
    "when_not_to_use": [
      "High-throughput write-heavy workloads where eventual consistency is acceptable and quorum latency is a bottleneck",
      "Single-node or embedded systems where there is no replication requirement",
      "Systems where availability during network partitions is more important than consistency, favoring AP designs instead"
    ],
    "when_not_to_use_zh": [
      "最终一致性可接受且法定人数延迟成为瓶颈的高吞吐写密集型工作负载",
      "没有复制需求的单节点或嵌入式系统",
      "网络分区期间可用性比一致性更重要的系统，应改用AP设计"
    ],
    "adopters": [
      "etcd/Kubernetes",
      "CockroachDB",
      "TiKV/TiDB",
      "Consul (HashiCorp)",
      "Apache ZooKeeper (ZAB)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Lamport, L. (1998). \"The Part-Time Parliament\". ACM Transactions on Computer Systems, 16(2).",
    "secondary_sources": [
      "Ongaro, D. & Ousterhout, J. (2014). \"In Search of an Understandable Consensus Algorithm\". Proceedings of USENIX ATC.",
      "Lamport, L. (2001). \"Paxos Made Simple\". ACM SIGACT News, 32(4).",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 9. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "cap-theorem",
        "type": "prerequisite"
      },
      {
        "slug": "leader-election",
        "type": "complement"
      },
      {
        "slug": "two-phase-commit",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 122,
    "name": "Eventual Consistency",
    "name_zh": "最终一致性",
    "slug": "eventual-consistency",
    "category": "distributed",
    "desc": "Embrace temporary inconsistency across replicas in exchange for higher availability",
    "desc_zh": "以副本间的暂时不一致换取更高的可用性",
    "steps": [
      "Identify which data in the system can tolerate temporary inconsistency by classifying operations into strong-consistency-required vs. eventually-consistent",
      "Choose a replication strategy: asynchronous replication with conflict detection, or anti-entropy protocols that periodically reconcile divergent replicas",
      "Implement a conflict resolution policy: last-writer-wins (LWW), version vectors, CRDTs, or application-level merge functions depending on data semantics",
      "Design read paths with staleness awareness: use read-your-writes consistency, monotonic reads, or causal consistency to meet user-facing expectations",
      "Add observability for convergence lag: monitor replication delay, conflict rates, and anti-entropy cycle times to detect when the system is slow to converge"
    ],
    "steps_zh": [
      "识别系统中哪些数据可以容忍暂时不一致，将操作分类为需要强一致性与可接受最终一致性",
      "选择复制策略：带冲突检测的异步复制，或定期协调分歧副本的反熵协议",
      "实现冲突解决策略：根据数据语义选择最后写入者获胜（LWW）、版本向量、CRDT或应用层合并函数",
      "设计具有过期感知的读取路径：使用读自己写、单调读或因果一致性来满足面向用户的期望",
      "为收敛延迟添加可观测性：监控复制延迟、冲突率和反熵周期时间，检测系统收敛缓慢的情况"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Classify Data",
      "Replication",
      "Conflict Resolution",
      "Read Consistency",
      "Convergence"
    ],
    "viz_labels_zh": [
      "数据分类",
      "数据复制",
      "冲突解决",
      "读一致性",
      "收敛延迟"
    ],
    "related": [
      "cap-theorem",
      "consensus-protocols",
      "cqrs-pattern",
      "saga-pattern"
    ],
    "tags": [
      "consistency",
      "availability",
      "replication",
      "conflict-resolution"
    ],
    "origin_author": "Werner Vogels, 2008",
    "origin_source": "Eventually Consistent (ACM Queue, 2008); Kleppmann, Designing Data-Intensive Applications, Ch. 5 & 9; DeCandia et al., Dynamo: Amazon's Highly Available Key-Value Store (SOSP 2007)",
    "origin_source_zh": "《最终一致性》（ACM Queue 2008）；Kleppmann《数据密集型应用系统设计》第5章和第9章；DeCandia等，《Dynamo：Amazon的高可用键值存储》（SOSP 2007）",
    "complexity": "intermediate",
    "when_to_use": [
      "When the system must remain available for writes even during network partitions or node failures",
      "When data is replicated across geographically distant datacenters and synchronous replication would add unacceptable latency",
      "When the business domain naturally tolerates stale reads, such as social media feeds, product catalogs, or DNS records",
      "When write throughput requirements exceed what a single-leader or consensus-based system can handle"
    ],
    "when_to_use_zh": [
      "当系统必须在网络分区或节点故障期间仍然可写时",
      "当数据跨地理位置遥远的数据中心复制且同步复制会增加不可接受的延迟时",
      "当业务领域天然容忍过期读取时，如社交媒体动态、产品目录或DNS记录",
      "当写入吞吐需求超过单领导者或基于共识的系统所能承受的范围时"
    ],
    "core_concepts": [
      "Convergence guarantee: All replicas will eventually reach the same state if no new updates are made, though the convergence window is unbounded",
      "Conflict resolution: When concurrent writes occur on different replicas, the system must deterministically resolve conflicts using LWW, vector clocks, or CRDTs",
      "Read-your-writes consistency: A session guarantee that a client always sees its own previous writes, even if other replicas are still catching up",
      "Anti-entropy: Background processes (like Merkle tree comparison or read-repair) that detect and fix divergence between replicas",
      "Tunable consistency: Systems like Cassandra allow per-query consistency levels (ONE, QUORUM, ALL) to balance latency and consistency on a per-operation basis"
    ],
    "core_concepts_zh": [
      "收敛保证：如果没有新的更新，所有副本最终将达到相同状态，但收敛窗口是无界的",
      "冲突解决：当不同副本上发生并发写入时，系统必须使用LWW、向量时钟或CRDT确定性地解决冲突",
      "读自己写一致性：一种会话保证，确保客户端始终看到自己之前的写入，即使其他副本仍在追赶",
      "反熵：检测并修复副本间分歧的后台进程（如Merkle树比较或读修复）",
      "可调一致性：Cassandra等系统允许按查询设置一致性级别（ONE、QUORUM、ALL），逐操作平衡延迟与一致性"
    ],
    "timeline": [
      [
        "1978",
        "Lamport defines happens-before relation, laying groundwork for reasoning about consistency in distributed systems"
      ],
      [
        "2007",
        "Amazon publishes the Dynamo paper at SOSP, demonstrating eventual consistency at massive scale with sloppy quorums"
      ],
      [
        "2008",
        "Werner Vogels publishes 'Eventually Consistent' in ACM Queue, formalizing the consistency spectrum"
      ],
      [
        "2011",
        "CRDTs (Conflict-free Replicated Data Types) formalized by Shapiro et al., enabling automatic conflict resolution"
      ],
      [
        "2017",
        "Kleppmann's DDIA provides a comprehensive synthesis of consistency models and their trade-offs for practitioners"
      ]
    ],
    "timeline_zh": [
      [
        "1978",
        "Lamport定义先发生关系，奠定分布式系统中推理一致性的基础"
      ],
      [
        "2007",
        "Amazon在SOSP发表Dynamo论文，展示使用松散法定人数实现大规模最终一致性"
      ],
      [
        "2008",
        "Werner Vogels在ACM Queue发表「最终一致性」，形式化一致性谱系"
      ],
      [
        "2011",
        "Shapiro等人形式化CRDT（无冲突复制数据类型），实现自动冲突解决"
      ],
      [
        "2017",
        "Kleppmann的《数据密集型应用系统设计》为从业者综合介绍一致性模型及其权衡"
      ]
    ],
    "dos": [
      "Do classify each piece of data by its consistency requirements because treating all data as eventually consistent leads to subtle correctness bugs",
      "Do implement read-your-writes guarantees at the session level because users who cannot see their own updates lose trust in the system",
      "Do monitor replication lag continuously because high convergence delays can indicate infrastructure problems or capacity exhaustion",
      "Do choose conflict resolution strategies that match domain semantics because generic LWW silently drops valid concurrent updates"
    ],
    "dos_zh": [
      "按一致性需求对每条数据进行分类，因为将所有数据都视为最终一致会导致微妙的正确性缺陷",
      "在会话级别实现读自己写保证，因为看不到自己更新的用户会丧失对系统的信任",
      "持续监控复制延迟，因为高收敛延迟可能表明基础设施问题或容量耗尽",
      "选择与领域语义匹配的冲突解决策略，因为通用的LWW会静默丢弃有效的并发更新"
    ],
    "donts": [
      "Don't assume eventually consistent means 'consistent enough' for financial transactions because money transfers require strong consistency to prevent double-spending",
      "Don't ignore the convergence window in SLA definitions because users may observe stale data during high-replication-lag periods",
      "Don't use wall-clock timestamps for conflict resolution across nodes because clock skew between machines makes LWW unreliable",
      "Don't forget to test failure scenarios because eventual consistency bugs typically manifest only under network partitions or high load"
    ],
    "donts_zh": [
      "不要假设最终一致性对金融交易「足够一致」，因为资金转账需要强一致性来防止双重花费",
      "不要在SLA定义中忽略收敛窗口，因为用户在高复制延迟期间可能观察到过期数据",
      "不要使用墙钟时间戳在节点间进行冲突解决，因为机器间的时钟偏差使LWW不可靠",
      "不要忘记测试故障场景，因为最终一致性缺陷通常只在网络分区或高负载下才会显现"
    ],
    "case_study_company": "Amazon (DynamoDB)",
    "case_study": "Amazon's DynamoDB, descended from the original Dynamo paper (2007), is architected around eventual consistency as the default read mode. During Prime Day 2022, DynamoDB served over 105 million requests per second across globally distributed tables. The eventually consistent read path avoids quorum overhead, delivering single-digit millisecond latency. For the shopping cart — Dynamo's original motivation — the Dynamo paper describes tracking divergent versions with vector clocks and reconciling them with an application-level merge that takes the union of the carts rather than applying last-writer-wins, so an added item is never silently lost even when concurrent writes conflict (at the cost that a deleted item can occasionally resurface).",
    "case_study_zh": "Amazon的DynamoDB源自2007年的Dynamo论文，以最终一致性作为默认读取模式。在2022年Prime Day期间，DynamoDB在全球分布式表中每秒处理超过1.05亿次请求。最终一致性读取路径避免了法定人数开销，实现个位数毫秒级延迟。对于购物车——Dynamo最初的设计动机——Dynamo论文描述的做法是用向量时钟跟踪分歧版本，并通过应用层合并（取各版本购物车的并集）而非最后写入者获胜来协调冲突，因此即使并发写入发生冲突，已添加的商品也不会被静默丢失（代价是已删除的商品偶尔会重新出现）。",
    "when_not_to_use": [
      "Financial ledger systems where double-spending or lost updates are unacceptable",
      "Distributed locking or leader election where stale reads can cause split-brain scenarios",
      "Systems where regulatory compliance demands strong consistency and audit trails for every state transition"
    ],
    "when_not_to_use_zh": [
      "双重花费或更新丢失不可接受的金融账本系统",
      "过期读取可能导致脑裂场景的分布式锁或领导者选举",
      "法规合规要求强一致性且每个状态转换都需要审计记录的系统"
    ],
    "adopters": [
      "Amazon DynamoDB",
      "Apache Cassandra",
      "Riak",
      "CouchDB",
      "Voldemort"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "performance",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Vogels, W. (2009). \"Eventually Consistent\". Communications of the ACM, 52(1).",
    "secondary_sources": [
      "DeCandia, G. et al. (2007). \"Dynamo: Amazon's Highly Available Key-Value Store\". Proceedings of ACM SOSP.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 5 & 9. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "cap-theorem",
        "type": "prerequisite"
      },
      {
        "slug": "consensus-protocols",
        "type": "alternative"
      },
      {
        "slug": "cqrs-pattern",
        "type": "complement"
      },
      {
        "slug": "saga-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 123,
    "name": "Backpressure Pattern",
    "name_zh": "背压模式",
    "slug": "backpressure-pattern",
    "category": "distributed",
    "desc": "Flow control mechanism where downstream consumers signal upstream producers to slow down",
    "desc_zh": "下游消费者向上游生产者发出减速信号的流量控制机制",
    "steps": [
      "Identify the bottleneck: profile the pipeline to find where the slowest consumer or resource constraint creates a mismatch between production and consumption rates",
      "Instrument buffer occupancy: add metrics to every queue, channel, and buffer in the pipeline to observe fill levels and detect when capacity is approaching limits",
      "Implement the signaling mechanism: choose between reactive pull-based demand (Reactive Streams), TCP window-based flow control, credit-based systems, or explicit NACK/pause signals",
      "Propagate backpressure end-to-end: ensure every stage in the pipeline forwards pressure upstream rather than absorbing it locally with unbounded buffers",
      "Define overflow strategies: decide what happens when backpressure fails to prevent overload — drop newest, drop oldest, sample, or reject with an error to the caller"
    ],
    "steps_zh": [
      "识别瓶颈：对管道进行性能分析，找到最慢的消费者或资源约束导致生产与消费速率不匹配的位置",
      "监测缓冲区占用：为管道中每个队列、通道和缓冲区添加指标，观察填充水平并检测何时接近容量限制",
      "实现信号机制：选择响应式拉取需求（Reactive Streams）、TCP窗口流控、基于信用的系统或显式NACK/暂停信号",
      "端到端传播背压：确保管道中每个阶段都将压力向上游传递，而非用无界缓冲区在本地吸收",
      "定义溢出策略：决定背压无法阻止过载时的处理方式——丢弃最新、丢弃最旧、采样或向调用者返回错误"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Bottleneck",
      "Buffer Metrics",
      "Signal Mechanism",
      "End-to-End",
      "Overflow Strategy"
    ],
    "viz_labels_zh": [
      "瓶颈识别",
      "缓冲监控",
      "背压信号",
      "端到端传播",
      "溢出策略"
    ],
    "related": [
      "circuit-breaker-pattern",
      "bulkhead-pattern",
      "reactive-extensions"
    ],
    "tags": [
      "flow-control",
      "resilience",
      "async",
      "streaming"
    ],
    "origin_author": "Concept from networking (TCP flow control); formalized in Reactive Streams (2013-2015)",
    "origin_source": "Reactive Streams Specification (2015); Nygard, Release It!, Ch. 5; Kleppmann, Designing Data-Intensive Applications, Ch. 11",
    "origin_source_zh": "Reactive Streams规范（2015）；Nygard《Release It!》第5章；Kleppmann《数据密集型应用系统设计》第11章",
    "complexity": "intermediate",
    "when_to_use": [
      "When a fast producer overwhelms a slow consumer, causing memory exhaustion or cascading failures in the pipeline",
      "When building real-time streaming data pipelines where data loss is unacceptable but throughput varies unpredictably",
      "When microservices have asymmetric processing speeds and unbounded message queues risk out-of-memory crashes",
      "When designing async event-driven systems where the event rate can spike during peak traffic periods"
    ],
    "when_to_use_zh": [
      "当快速生产者压垮慢速消费者，导致内存耗尽或管道级联故障时",
      "当构建实时流数据管道，数据丢失不可接受但吞吐量不可预测地变化时",
      "当微服务的处理速度不对称，无界消息队列有内存溢出崩溃风险时",
      "当设计异步事件驱动系统，事件速率在流量高峰期可能突增时"
    ],
    "core_concepts": [
      "Demand signaling: Consumers explicitly request a specific number of items they can handle, preventing producers from pushing more than the consumer can process",
      "Bounded buffers: Fixed-size queues that block or reject producers when full, converting memory pressure into temporal backpressure",
      "End-to-end propagation: Backpressure must flow from the slowest stage all the way back to the original source; a single unbounded buffer breaks the chain",
      "Overflow policies: Strategies for when backpressure is insufficient — drop-head, drop-tail, sample, buffer-to-disk, or reject with HTTP 429/503",
      "Reactive Streams protocol: A standard (adopted as java.util.concurrent.Flow in JDK 9) defining Publisher, Subscriber, Subscription, and Processor interfaces with non-blocking demand signaling"
    ],
    "core_concepts_zh": [
      "需求信号：消费者显式请求其能处理的特定数量项目，阻止生产者推送超出消费者处理能力的数据",
      "有界缓冲区：固定大小的队列在满时阻塞或拒绝生产者，将内存压力转换为时间维度的背压",
      "端到端传播：背压必须从最慢的阶段一路回传到原始数据源；单个无界缓冲区就会打破链条",
      "溢出策略：背压不足时的处理策略——丢弃队首、丢弃队尾、采样、缓冲到磁盘或以HTTP 429/503拒绝",
      "Reactive Streams协议：定义Publisher、Subscriber、Subscription和Processor接口的标准（JDK 9中采纳为java.util.concurrent.Flow），支持非阻塞需求信号"
    ],
    "timeline": [
      [
        "1988",
        "Van Jacobson designs TCP congestion control with window-based flow control, a foundational backpressure mechanism"
      ],
      [
        "2013",
        "Engineers from Netflix, Pivotal, Lightbend, and Red Hat begin the Reactive Streams initiative"
      ],
      [
        "2015",
        "Reactive Streams 1.0 specification published, defining the Publisher-Subscriber contract with demand signaling"
      ],
      [
        "2017",
        "JDK 9 adopts Reactive Streams as java.util.concurrent.Flow, bringing backpressure to the Java standard library"
      ],
      [
        "2018",
        "Nygard's Release It! second edition covers backpressure as a core stability pattern for production systems"
      ]
    ],
    "timeline_zh": [
      [
        "1988",
        "Van Jacobson设计TCP拥塞控制及基于窗口的流控，奠定了背压机制的基础"
      ],
      [
        "2013",
        "Netflix、Pivotal、Lightbend和Red Hat的工程师发起Reactive Streams倡议"
      ],
      [
        "2015",
        "Reactive Streams 1.0规范发布，定义带需求信号的Publisher-Subscriber契约"
      ],
      [
        "2017",
        "JDK 9将Reactive Streams采纳为java.util.concurrent.Flow，将背压引入Java标准库"
      ],
      [
        "2018",
        "Nygard《Release It!》第二版将背压作为生产系统核心稳定性模式进行介绍"
      ]
    ],
    "dos": [
      "Do propagate backpressure end-to-end through every stage because a single unbounded buffer creates a hidden failure point",
      "Do monitor buffer fill ratios as leading indicators because they reveal impending overload before out-of-memory errors occur",
      "Do choose overflow strategies explicitly for each pipeline stage because the right policy depends on whether data loss is acceptable",
      "Do test backpressure behavior under peak load conditions because backpressure bugs only manifest when the system is under stress"
    ],
    "dos_zh": [
      "将背压端到端传播通过每个阶段，因为单个无界缓冲区会形成隐藏的故障点",
      "将缓冲区填充率作为先导指标进行监控，因为它们在内存溢出错误发生之前就能揭示即将到来的过载",
      "为每个管道阶段显式选择溢出策略，因为正确的策略取决于数据丢失是否可接受",
      "在峰值负载条件下测试背压行为，因为背压缺陷只在系统承受压力时才会显现"
    ],
    "donts": [
      "Don't use unbounded queues as a substitute for backpressure because they merely convert a throughput problem into a memory problem",
      "Don't drop data silently without metrics because undetected data loss undermines system correctness and erodes operator trust",
      "Don't apply backpressure only at the network edge because internal bottlenecks need their own flow control too",
      "Don't conflate backpressure with rate limiting because backpressure is adaptive and demand-driven while rate limiting is a fixed policy"
    ],
    "donts_zh": [
      "不要用无界队列替代背压，因为它们只是将吞吐问题转换为内存问题",
      "不要在没有指标的情况下静默丢弃数据，因为未被检测到的数据丢失会损害系统正确性并侵蚀运维信任",
      "不要仅在网络边缘应用背压，因为内部瓶颈也需要自己的流量控制",
      "不要将背压与速率限制混为一谈，因为背压是自适应的需求驱动机制，而速率限制是固定策略"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's streaming platform uses RxJava and Project Reactor extensively, both of which implement Reactive Streams backpressure. When Netflix's personalization service generates recommendation updates, downstream services (like the row-sorting service) can vary in processing capacity by 10x depending on the device type being served. Without backpressure, the recommendation engine would flood slower consumers, causing cascading timeouts. By implementing pull-based demand signaling, each consumer requests only what it can handle, and the recommendation engine automatically adjusts its emission rate. This pattern reduced out-of-memory incidents in the recommendation pipeline by over 90%.",
    "case_study_zh": "Netflix的流媒体平台广泛使用RxJava和Project Reactor，两者都实现了Reactive Streams背压。当Netflix的个性化服务生成推荐更新时，下游服务（如行排序服务）的处理能力根据服务的设备类型可能相差10倍。没有背压时，推荐引擎会淹没较慢的消费者，导致级联超时。通过实现基于拉取的需求信号，每个消费者只请求其能处理的数量，推荐引擎自动调整其发射速率。该模式将推荐管道中的内存溢出事件减少了90%以上。",
    "when_not_to_use": [
      "Fire-and-forget telemetry where dropping some data points is preferable to slowing the producer",
      "Simple request-response APIs where synchronous blocking naturally throttles the caller",
      "Batch processing systems where the entire dataset is known in advance and can be partitioned statically"
    ],
    "when_not_to_use_zh": [
      "丢弃部分数据点比减慢生产者更可取的即发即忘遥测场景",
      "同步阻塞自然节流调用者的简单请求-响应API",
      "整个数据集预先已知且可以静态分区的批处理系统"
    ],
    "adopters": [
      "Netflix (RxJava)",
      "Akka Streams (Lightbend)",
      "Project Reactor (Spring)",
      "Apache Flink",
      "Apache Kafka (consumer fetch)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Reactive Streams Contributors (2015). \"Reactive Streams Specification\". reactive-streams.org.",
    "secondary_sources": [
      "Nygard, M.T. (2018). \"Release It!\", 2nd ed., Ch. 5. Pragmatic Bookshelf.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 11. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "circuit-breaker-pattern",
        "type": "complement"
      },
      {
        "slug": "bulkhead-pattern",
        "type": "complement"
      },
      {
        "slug": "reactive-extensions",
        "type": "complement"
      }
    ]
  },
  {
    "id": 124,
    "name": "Sidecar Pattern",
    "name_zh": "边车模式",
    "slug": "sidecar-pattern",
    "category": "distributed",
    "desc": "Attach auxiliary processes alongside primary services for cross-cutting concerns",
    "desc_zh": "在主服务旁部署辅助进程以处理横切关注点",
    "steps": [
      "Identify cross-cutting concerns: determine which capabilities (logging, monitoring, TLS termination, service discovery, circuit breaking) should be extracted from the application code",
      "Design the sidecar contract: define how the primary service and sidecar communicate — shared localhost network, Unix domain sockets, or shared filesystem volumes",
      "Deploy the sidecar as a co-located process: configure the orchestrator (e.g., Kubernetes pod) to schedule the sidecar container alongside the primary container with shared lifecycle",
      "Route traffic through the sidecar: use iptables rules, init containers, or transparent proxying so that inbound and outbound traffic flows through the sidecar without application code changes",
      "Manage sidecar lifecycle independently: update, configure, and version the sidecar proxy separately from the application, enabling platform-wide policy changes without redeploying services"
    ],
    "steps_zh": [
      "识别横切关注点：确定哪些能力（日志、监控、TLS终止、服务发现、熔断）应从应用代码中提取",
      "设计边车契约：定义主服务与边车的通信方式——共享localhost网络、Unix域套接字或共享文件系统卷",
      "将边车作为共置进程部署：配置编排器（如Kubernetes Pod）将边车容器与主容器共同调度，共享生命周期",
      "将流量路由通过边车：使用iptables规则、init容器或透明代理，使入站和出站流量在无需修改应用代码的情况下经过边车",
      "独立管理边车生命周期：独立于应用更新、配置和版本化边车代理，实现无需重新部署服务即可进行平台级策略变更"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "App Container",
      "Sidecar",
      "Shared Network",
      "Traffic Route",
      "Lifecycle"
    ],
    "viz_labels_zh": [
      "应用容器",
      "边车",
      "共享网络",
      "流量路由",
      "生命周期"
    ],
    "related": [
      "service-mesh-pattern",
      "hexagonal-architecture",
      "bulkhead-pattern"
    ],
    "tags": [
      "infrastructure",
      "proxy",
      "microservices",
      "cross-cutting-concerns"
    ],
    "origin_author": "Microsoft Azure Architecture Center (formalized 2016); popularized by Envoy/Istio (2016-2017)",
    "origin_source": "Newman, Building Microservices, Ch. 8 (2nd ed., 2021); Nygard, Release It!, Ch. 17; Burns, Designing Distributed Systems (O'Reilly, 2018)",
    "origin_source_zh": "Newman《构建微服务》第8章（第二版，2021）；Nygard《Release It!》第17章；Burns《设计分布式系统》（O'Reilly，2018）",
    "complexity": "intermediate",
    "when_to_use": [
      "When multiple services written in different languages need the same cross-cutting capabilities without duplicating libraries in each language",
      "When platform teams want to enforce consistent networking policies (mTLS, retries, rate limiting) across all services transparently",
      "When the application team should not need to understand or maintain infrastructure concerns like service mesh configuration",
      "When migrating legacy applications to a service mesh where modifying application code is impractical"
    ],
    "when_to_use_zh": [
      "当用不同语言编写的多个服务需要相同的横切能力而不想在每种语言中重复库时",
      "当平台团队希望透明地在所有服务间强制一致的网络策略（mTLS、重试、速率限制）时",
      "当应用团队不应需要理解或维护服务网格配置等基础设施关注点时",
      "当迁移遗留应用到服务网格且修改应用代码不现实时"
    ],
    "core_concepts": [
      "Co-location: The sidecar runs in the same network namespace and scheduling unit (e.g., Kubernetes pod) as the primary service, sharing localhost and lifecycle",
      "Transparent interception: Traffic is redirected through the sidecar via iptables or CNI plugins so the application communicates normally without awareness of the proxy",
      "Polyglot support: Because the sidecar operates at the network layer, it provides uniform capabilities regardless of the primary service's programming language",
      "Separation of concerns: Application developers focus on business logic while platform engineers manage the sidecar configuration and upgrades",
      "Control plane / data plane split: A centralized control plane pushes configuration to distributed sidecar data planes, enabling fleet-wide policy changes"
    ],
    "core_concepts_zh": [
      "共置：边车与主服务运行在同一网络命名空间和调度单元（如Kubernetes Pod）中，共享localhost和生命周期",
      "透明拦截：流量通过iptables或CNI插件重定向到边车，应用在不感知代理的情况下正常通信",
      "多语言支持：因为边车在网络层操作，无论主服务使用何种编程语言都能提供统一能力",
      "关注点分离：应用开发者专注业务逻辑，平台工程师管理边车配置和升级",
      "控制平面/数据平面分离：集中式控制平面向分布式边车数据平面推送配置，实现覆盖整个服务集群的策略变更"
    ],
    "timeline": [
      [
        "2013",
        "Netflix's Prana sidecar provides service discovery and health checking for non-JVM services in the Netflix OSS ecosystem"
      ],
      [
        "2016",
        "Lyft open-sources Envoy, a high-performance L7 proxy designed from the ground up as a sidecar for microservices"
      ],
      [
        "2017",
        "Istio launches using Envoy as its sidecar data plane, popularizing the sidecar pattern for service mesh architectures"
      ],
      [
        "2021",
        "Newman's Building Microservices 2nd edition dedicates coverage to the sidecar pattern as a standard microservices infrastructure approach"
      ],
      [
        "2023",
        "Ambient mesh architectures emerge (Istio ambient mode) exploring per-node proxies as an alternative to per-pod sidecars"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "Netflix的Prana边车为Netflix OSS生态中的非JVM服务提供服务发现和健康检查"
      ],
      [
        "2016",
        "Lyft开源Envoy，一个从零设计为微服务边车的高性能L7代理"
      ],
      [
        "2017",
        "Istio使用Envoy作为边车数据平面发布，推动边车模式在服务网格架构中的普及"
      ],
      [
        "2021",
        "Newman《构建微服务》第二版将边车模式作为标准微服务基础设施方法进行专门介绍"
      ],
      [
        "2023",
        "环境网格架构出现（Istio ambient模式），探索每节点代理作为每Pod边车的替代方案"
      ]
    ],
    "dos": [
      "Do keep the sidecar lightweight and single-purpose because a bloated sidecar adds latency and resource overhead to every pod",
      "Do version and release the sidecar independently from the application because coupling their release cycles defeats the purpose of separation",
      "Do provide escape hatches for bypassing the sidecar because some traffic (e.g., health probes, localhost debugging) should not be intercepted",
      "Do monitor sidecar resource consumption separately because sidecar CPU and memory usage can be significant at scale"
    ],
    "dos_zh": [
      "保持边车轻量和单一职责，因为臃肿的边车会为每个Pod增加延迟和资源开销",
      "独立于应用对边车进行版本管理和发布，因为耦合它们的发布周期违背了分离的初衷",
      "提供绕过边车的逃生通道，因为某些流量（如健康探针、localhost调试）不应被拦截",
      "单独监控边车的资源消耗，因为大规模部署下边车的CPU和内存使用可能相当可观"
    ],
    "donts": [
      "Don't put business logic in the sidecar because it should contain only infrastructure concerns; mixing concerns creates tight coupling",
      "Don't assume zero-cost proxying because each sidecar adds at least one network hop of latency and consumes CPU for TLS handshakes and header parsing",
      "Don't deploy sidecars without a control plane because managing hundreds of independently configured sidecars becomes operationally intractable",
      "Don't ignore sidecar startup ordering because if the sidecar is not ready before the application starts, initial requests will fail"
    ],
    "donts_zh": [
      "不要在边车中放置业务逻辑，因为它应只包含基础设施关注点；混合关注点会造成紧耦合",
      "不要假设代理零成本，因为每个边车至少增加一次网络跳转延迟并消耗CPU用于TLS握手和头部解析",
      "不要在没有控制平面的情况下部署边车，因为管理数百个独立配置的边车在运维上不可行",
      "不要忽视边车启动顺序，因为如果边车在应用启动前未就绪，初始请求将失败"
    ],
    "case_study_company": "Lyft",
    "case_study": "Lyft built Envoy as an internal sidecar proxy to solve the challenge of operating a polyglot microservices fleet (Python, Go, C++) with consistent observability and resilience. Before Envoy, each language ecosystem maintained its own HTTP client library with inconsistent retry logic, timeout handling, and metrics emission. By deploying Envoy as a sidecar in every pod, Lyft achieved uniform distributed tracing, automatic retries with circuit breaking, and consistent latency metrics across all services regardless of language. Envoy handles over 3 million requests per second at Lyft and was donated to the CNCF, becoming the foundation for Istio and other service mesh implementations.",
    "case_study_zh": "Lyft构建Envoy作为内部边车代理，解决运营多语言微服务集群（Python、Go、C++）时需要一致可观测性和弹性的挑战。在Envoy之前，每种语言生态各自维护HTTP客户端库，重试逻辑、超时处理和指标收集都不一致。通过在每个Pod中部署Envoy边车，Lyft实现了统一的分布式追踪、带熔断的自动重试以及跨所有服务一致的延迟指标，无论使用何种语言。Envoy在Lyft每秒处理超过300万请求，后捐赠给CNCF，成为Istio和其他服务网格实现的基础。",
    "when_not_to_use": [
      "Monolithic applications where cross-cutting concerns are handled within the application framework itself",
      "Extremely latency-sensitive paths where even the sub-millisecond overhead of a local proxy is unacceptable",
      "Small deployments with only a few services where the operational complexity of managing sidecars outweighs the benefits"
    ],
    "when_not_to_use_zh": [
      "横切关注点在应用框架内部处理的单体应用",
      "即使本地代理的亚毫秒级开销也不可接受的极端延迟敏感路径",
      "只有少量服务的小型部署，管理边车的运维复杂性超过其收益"
    ],
    "adopters": [
      "Lyft (Envoy)",
      "Google (Istio)",
      "Airbnb",
      "Stripe",
      "Shopify"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Burns, B. (2018). \"Designing Distributed Systems: Patterns and Paradigms for Scalable, Reliable Services\". O'Reilly Media.",
    "secondary_sources": [
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 8. O'Reilly Media.",
      "Microsoft Azure Architecture Center (2016). \"Sidecar Pattern\". docs.microsoft.com.",
      "Nygard, M.T. (2018). \"Release It!\", 2nd ed., Ch. 17. Pragmatic Bookshelf."
    ],
    "typed_relations": [
      {
        "slug": "service-mesh-pattern",
        "type": "complement"
      },
      {
        "slug": "hexagonal-architecture",
        "type": "complement"
      },
      {
        "slug": "bulkhead-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 125,
    "name": "Leader Election",
    "name_zh": "领导者选举",
    "slug": "leader-election",
    "category": "distributed",
    "desc": "Coordinate a single leader node among distributed peers to avoid conflicts",
    "desc_zh": "在分布式对等节点中协调选出单一领导者以避免冲突",
    "steps": [
      "Define the leadership scope: determine what resource or responsibility the leader controls (e.g., write coordination, task scheduling, partition ownership)",
      "Select an election mechanism: choose between consensus-based election (Raft/ZooKeeper), lease-based election (DynamoDB lock, etcd lease), or bully/ring algorithms for simpler topologies",
      "Implement leader heartbeats: the leader periodically renews a lease or sends heartbeats so followers can detect leader failure within a bounded time",
      "Handle leader failure: when the heartbeat or lease expires, remaining nodes initiate a new election using the chosen mechanism to converge on a single new leader",
      "Protect against split-brain: use fencing tokens, epoch numbers, or distributed locks to ensure that a deposed leader cannot continue acting as leader after a new one is elected"
    ],
    "steps_zh": [
      "定义领导权范围：确定领导者控制什么资源或职责（如写协调、任务调度、分区所有权）",
      "选择选举机制：在基于共识的选举（Raft/ZooKeeper）、基于租约的选举（DynamoDB锁、etcd租约）或适用于简单拓扑的霸道/环形算法之间选择",
      "实现领导者心跳：领导者定期续约租约或发送心跳，以便跟随者在有界时间内检测到领导者故障",
      "处理领导者故障：当心跳或租约过期时，剩余节点使用所选机制发起新选举，收敛到单一新领导者",
      "防止脑裂：使用防护令牌、纪元编号或分布式锁确保被废黜的领导者在新领导者选出后无法继续充当领导者"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Leadership Scope",
      "Election Mechanism",
      "Heartbeat",
      "Failure Detection",
      "Fencing Token"
    ],
    "viz_labels_zh": [
      "领导范围",
      "选举机制",
      "心跳",
      "故障检测",
      "防护令牌"
    ],
    "related": [
      "consensus-protocols",
      "cap-theorem",
      "two-phase-commit"
    ],
    "tags": [
      "coordination",
      "fault-tolerance",
      "leadership",
      "distributed-locking"
    ],
    "origin_author": "Garcia-Molina (Bully algorithm, 1982); Lamport (Paxos-based election, 1998)",
    "origin_source": "Elections in a Distributed Computing System (IEEE, 1982); Kleppmann, Designing Data-Intensive Applications, Ch. 8 & 9; Nygard, Release It!, Ch. 12",
    "origin_source_zh": "《分布式计算系统中的选举》（IEEE，1982）；Kleppmann《数据密集型应用系统设计》第8章和第9章；Nygard《Release It!》第12章",
    "complexity": "advanced",
    "when_to_use": [
      "When exactly one node must coordinate writes or task assignment to prevent conflicts or duplicate work",
      "When a distributed scheduler or cron-like system needs a single active instance to avoid running duplicate jobs",
      "When implementing master-slave replication where one node must be the authoritative writer at any given time",
      "When partitioned data processing requires a single coordinator to assign work units to workers without overlap"
    ],
    "when_to_use_zh": [
      "当恰好一个节点必须协调写入或任务分配以防止冲突或重复工作时",
      "当分布式调度器或类cron系统需要单一活跃实例以避免运行重复作业时",
      "当实现主从复制且在任意时刻需要一个权威写入节点时",
      "当分区数据处理需要单一协调者向工作者分配无重叠的工作单元时"
    ],
    "core_concepts": [
      "Lease-based leadership: The leader holds a time-bounded lease that must be periodically renewed; if the lease expires, other nodes can claim leadership",
      "Fencing tokens: Monotonically increasing tokens issued with each new leader that allow storage systems to reject stale writes from former leaders",
      "Split-brain prevention: Mechanisms to ensure at most one leader is active at any time, even during network partitions, typically requiring a quorum or external lock service",
      "Graceful handoff: The outgoing leader completes in-flight operations and releases its lease cleanly before the new leader takes over, minimizing disruption",
      "Leader liveness detection: Heartbeat intervals and timeout thresholds that balance fast failure detection against false positives from temporary network glitches"
    ],
    "core_concepts_zh": [
      "基于租约的领导权：领导者持有有时间限制的租约，必须定期续约；如果租约过期，其他节点可以竞选领导权",
      "防护令牌：每次新领导者产生时发放的单调递增令牌，允许存储系统拒绝来自前任领导者的过期写入",
      "脑裂防护：确保在任何时刻最多只有一个活跃领导者的机制，即使在网络分区期间也是如此，通常需要法定人数或外部锁服务",
      "优雅交接：即将卸任的领导者完成进行中的操作并干净地释放租约，然后新领导者接管，最小化中断",
      "领导者活性检测：心跳间隔和超时阈值，在快速故障检测与临时网络抖动引起的误报之间取得平衡"
    ],
    "timeline": [
      [
        "1982",
        "Hector Garcia-Molina publishes the Bully algorithm and the Invitation algorithm for leader election in distributed systems"
      ],
      [
        "1998",
        "Lamport's Paxos provides a consensus-based foundation for leader election in asynchronous systems"
      ],
      [
        "2010",
        "Apache ZooKeeper's ephemeral nodes and sequential znodes become the de facto leader election primitive in the Hadoop ecosystem"
      ],
      [
        "2014",
        "Raft formalizes leader election as an explicit sub-protocol with randomized timeouts for simplicity"
      ],
      [
        "2017",
        "Kleppmann's DDIA warns about the dangers of leader election without fencing tokens, citing real-world split-brain incidents"
      ]
    ],
    "timeline_zh": [
      [
        "1982",
        "Hector Garcia-Molina发表分布式系统中领导者选举的霸道算法和邀请算法"
      ],
      [
        "1998",
        "Lamport的Paxos为异步系统中的领导者选举提供基于共识的基础"
      ],
      [
        "2010",
        "Apache ZooKeeper的临时节点和顺序znode成为Hadoop生态中事实上的领导者选举原语"
      ],
      [
        "2014",
        "Raft将领导者选举形式化为带随机超时的显式子协议，追求简洁性"
      ],
      [
        "2017",
        "Kleppmann的《数据密集型应用系统设计》警告不使用防护令牌进行领导者选举的危险，引用真实的脑裂事故"
      ]
    ],
    "dos": [
      "Do use fencing tokens for all leader-gated operations because a network-partitioned former leader may not know it has been replaced",
      "Do set lease timeouts carefully: too short causes unnecessary re-elections during GC pauses, too long delays failover",
      "Do implement graceful leader shutdown that releases the lease proactively because it enables faster handoff without waiting for lease expiry",
      "Do test leader election under network partition scenarios because correctness bugs only surface when split-brain conditions actually occur"
    ],
    "dos_zh": [
      "对所有领导者门控操作使用防护令牌，因为被网络分区隔离的前任领导者可能不知道自己已被替换",
      "仔细设置租约超时：过短会在GC暂停期间导致不必要的重新选举，过长则延迟故障转移",
      "实现主动释放租约的优雅领导者关闭，因为这能实现更快的交接而无需等待租约过期",
      "在网络分区场景下测试领导者选举，因为正确性缺陷只在脑裂条件实际发生时才会浮现"
    ],
    "donts": [
      "Don't rely solely on heartbeat absence to detect leader failure because network delays and GC pauses can cause false positives leading to unnecessary elections",
      "Don't assume the leader election result is globally visible immediately because propagation delays mean some nodes may still believe the old leader is active",
      "Don't use leader election when the workload can be partitioned among all nodes because a single leader becomes a throughput bottleneck",
      "Don't implement your own leader election from scratch when proven implementations exist because subtle correctness bugs in election protocols are notoriously difficult to detect"
    ],
    "donts_zh": [
      "不要仅依赖心跳缺失来检测领导者故障，因为网络延迟和GC暂停会导致误报从而引发不必要的选举",
      "不要假设领导者选举结果对所有节点立即可见，因为传播延迟意味着某些节点可能仍认为旧领导者活跃",
      "不要在工作负载可以在所有节点间分区时使用领导者选举，因为单一领导者会成为吞吐瓶颈",
      "不要在已有经过验证的实现时从头实现领导者选举，因为选举协议中微妙的正确性缺陷出了名地难以检测"
    ],
    "case_study_company": "Apache Kafka",
    "case_study": "Apache Kafka uses leader election to assign a single broker as the leader for each topic partition. Prior to KRaft (Kafka Raft), Kafka relied on Apache ZooKeeper for controller election, where one broker served as the cluster controller responsible for partition leader assignment. When a broker failed, ZooKeeper's session timeout triggered controller failover, which then reassigned partition leaders. This process could take 30+ seconds for large clusters. With the introduction of KRaft in Kafka 3.3 (2022), Kafka eliminated the ZooKeeper dependency by implementing Raft-based leader election internally, reducing controller failover time to under 10 seconds and simplifying the operational model from two distributed systems to one.",
    "case_study_zh": "Apache Kafka使用领导者选举为每个主题分区分配单一代理作为领导者。在KRaft（Kafka Raft）之前，Kafka依赖Apache ZooKeeper进行控制器选举，一个代理充当集群控制器负责分区领导者分配。当代理故障时，ZooKeeper的会话超时触发控制器故障转移，然后重新分配分区领导者。对于大型集群此过程可能需要30秒以上。随着Kafka 3.3（2022）引入KRaft，Kafka消除了ZooKeeper依赖，通过内部实现基于Raft的领导者选举，将控制器故障转移时间缩短到10秒以内，并将运维模型从两个分布式系统简化为一个。",
    "when_not_to_use": [
      "Stateless services behind a load balancer where any instance can handle any request without coordination",
      "Peer-to-peer systems designed for symmetric participation where introducing a leader adds unnecessary centralization",
      "Systems where brief periods of duplicate processing are acceptable and cheaper than the complexity of leader election"
    ],
    "when_not_to_use_zh": [
      "负载均衡器后的无状态服务，任何实例都能处理任何请求而无需协调",
      "设计为对称参与的点对点系统，引入领导者会增加不必要的集中化",
      "短暂的重复处理可接受且成本低于领导者选举复杂性的系统"
    ],
    "adopters": [
      "Apache Kafka (KRaft)",
      "Apache ZooKeeper",
      "etcd",
      "Redis Sentinel",
      "Elasticsearch"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Garcia-Molina, H. (1982). \"Elections in a Distributed Computing System\". IEEE Transactions on Computers, C-31(1).",
    "secondary_sources": [
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 8 & 9. O'Reilly Media.",
      "Lamport, L. (1998). \"The Part-Time Parliament\". ACM Transactions on Computer Systems, 16(2)."
    ],
    "typed_relations": [
      {
        "slug": "consensus-protocols",
        "type": "extends"
      },
      {
        "slug": "cap-theorem",
        "type": "prerequisite"
      },
      {
        "slug": "two-phase-commit",
        "type": "related"
      }
    ]
  },
  {
    "id": 126,
    "name": "Consistent Hashing",
    "name_zh": "一致性哈希",
    "slug": "consistent-hashing",
    "category": "distributed",
    "desc": "Distribute data across nodes with minimal redistribution when the cluster changes",
    "desc_zh": "在集群变更时以最小重新分配量将数据分布到各节点",
    "steps": [
      "Define the hash ring: map both data keys and node identifiers onto the same circular hash space (typically 0 to 2^32-1) using a uniform hash function",
      "Assign keys to nodes: each key is assigned to the first node encountered when walking clockwise around the ring from the key's hash position",
      "Add virtual nodes: map each physical node to multiple positions on the ring (e.g., 100-200 virtual nodes per physical node) to ensure uniform load distribution",
      "Handle node addition: when a new node joins, it claims a portion of the ring; only the keys in the affected arc need to migrate from the successor node",
      "Handle node removal: when a node leaves, its keys are redistributed to the next node clockwise on the ring, affecting only a fraction of the total keys"
    ],
    "steps_zh": [
      "定义哈希环：使用均匀哈希函数将数据键和节点标识符映射到相同的环形哈希空间（通常为0到2^32-1）",
      "将键分配给节点：每个键被分配给从该键的哈希位置沿环顺时针方向遇到的第一个节点",
      "添加虚拟节点：将每个物理节点映射到环上的多个位置（如每个物理节点100-200个虚拟节点）以确保负载均匀分布",
      "处理节点加入：新节点加入时声明环上的一段弧；只有受影响弧段上的键需要从后继节点迁移",
      "处理节点移除：节点离开时其键重新分配给环上顺时针方向的下一个节点，只影响总键数的一小部分"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Hash Ring",
      "Key Assign",
      "Virtual Nodes",
      "Node Add",
      "Node Remove"
    ],
    "viz_labels_zh": [
      "哈希环",
      "键分配",
      "虚拟节点",
      "节点加入",
      "节点移除"
    ],
    "related": [
      "sharding-strategies",
      "gossip-protocol",
      "cap-theorem"
    ],
    "tags": [
      "partitioning",
      "load-balancing",
      "hashing",
      "scalability"
    ],
    "origin_author": "David Karger, Eric Lehman, Tom Leighton, et al., 1997",
    "origin_source": "Consistent Hashing and Random Trees (ACM STOC 1997); Kleppmann, Designing Data-Intensive Applications, Ch. 6; Dynamo: Amazon's Highly Available Key-Value Store (SOSP 2007)",
    "origin_source_zh": "《一致性哈希与随机树》（ACM STOC 1997）；Kleppmann《数据密集型应用系统设计》第6章；《Dynamo：Amazon高可用键值存储》（SOSP 2007）",
    "complexity": "intermediate",
    "when_to_use": [
      "When distributing cached data across a cluster of cache servers and nodes are frequently added or removed",
      "When building a distributed key-value store that must rebalance data with minimal migration on cluster resize",
      "When implementing client-side load balancing where a consistent mapping from request keys to backend servers is needed",
      "When designing a CDN or object storage system where content must be distributed across edge nodes predictably"
    ],
    "when_to_use_zh": [
      "当在缓存服务器集群间分布缓存数据且节点频繁增减时",
      "当构建分布式键值存储，需要在集群扩缩容时以最小迁移量重新平衡数据时",
      "当实现客户端负载均衡，需要从请求键到后端服务器的一致映射时",
      "当设计CDN或对象存储系统，内容必须可预测地分布到边缘节点时"
    ],
    "core_concepts": [
      "Hash ring: A circular hash space where the output of the hash function wraps around, so the space has no endpoints and every position has a successor",
      "Virtual nodes (vnodes): Multiple hash positions per physical node that smooth out distribution imbalances caused by non-uniform hash clustering",
      "Minimal disruption: When a node joins or leaves, only O(K/N) keys need to move (K total keys, N nodes), compared to O(K) with naive modular hashing",
      "Replication via ring walking: Data is replicated by storing copies on the next R distinct physical nodes encountered clockwise, providing fault tolerance",
      "Bounded load extension: Google's 2017 improvement that caps the maximum load on any node to (1+epsilon) times the average, preventing hot spots"
    ],
    "core_concepts_zh": [
      "哈希环：环形哈希空间，哈希函数输出首尾相接，空间没有端点，每个位置都有后继",
      "虚拟节点（vnodes）：每个物理节点在环上占据多个哈希位置，平滑非均匀哈希聚集导致的分布不均",
      "最小扰动：节点加入或离开时只需移动O(K/N)个键（K为总键数，N为节点数），而朴素取模哈希需要O(K)",
      "基于环遍历的复制：通过将副本存储在顺时针方向遇到的下R个不同物理节点上提供容错",
      "有界负载扩展：Google 2017年的改进，将任意节点的最大负载限制在平均值的(1+epsilon)倍以内，防止热点"
    ],
    "timeline": [
      [
        "1997",
        "Karger et al. publish Consistent Hashing and Random Trees at ACM STOC, originally motivated by web caching"
      ],
      [
        "2007",
        "Amazon's Dynamo paper uses consistent hashing with virtual nodes for its distributed key-value store"
      ],
      [
        "2008",
        "Apache Cassandra adopts consistent hashing as its data partitioning strategy, following the Dynamo design"
      ],
      [
        "2014",
        "Consistent hashing becomes standard in memcached clients (libmemcached, ketama) for distributed cache clusters"
      ],
      [
        "2017",
        "Google publishes Consistent Hashing with Bounded Loads, addressing the load imbalance problem in practical deployments"
      ]
    ],
    "timeline_zh": [
      [
        "1997",
        "Karger等人在ACM STOC发表《一致性哈希与随机树》，最初动机来自Web缓存"
      ],
      [
        "2007",
        "Amazon的Dynamo论文使用带虚拟节点的一致性哈希作为分布式键值存储的分区策略"
      ],
      [
        "2008",
        "Apache Cassandra采用一致性哈希作为数据分区策略，遵循Dynamo设计"
      ],
      [
        "2014",
        "一致性哈希成为memcached客户端（libmemcached、ketama）在分布式缓存集群中的标准"
      ],
      [
        "2017",
        "Google发表「有界负载的一致性哈希」，解决实际部署中的负载不均衡问题"
      ]
    ],
    "dos": [
      "Do use a sufficient number of virtual nodes per physical node (100+) because too few virtual nodes leads to uneven key distribution",
      "Do choose a hash function with good uniformity (like xxHash or MurmurHash) because poor distribution creates hot spots regardless of virtual nodes",
      "Do consider heterogeneous node capacity by assigning more virtual nodes to more powerful machines to achieve proportional load distribution",
      "Do implement gradual key migration when nodes join because migrating all affected keys at once can cause load spikes on neighboring nodes"
    ],
    "dos_zh": [
      "为每个物理节点使用足够数量的虚拟节点（100+），因为虚拟节点太少会导致键分布不均",
      "选择具有良好均匀性的哈希函数（如xxHash或MurmurHash），因为分布差的函数无论虚拟节点多少都会产生热点",
      "考虑异构节点容量，为更强大的机器分配更多虚拟节点以实现按比例的负载分配",
      "节点加入时实现渐进式键迁移，因为一次性迁移所有受影响的键会在相邻节点造成负载尖峰"
    ],
    "donts": [
      "Don't use consistent hashing with very small clusters (2-3 nodes) because the overhead is unnecessary and simple partitioning suffices",
      "Don't forget about replication when implementing consistent hashing because hash ring assignment alone does not provide fault tolerance",
      "Don't assume consistent hashing eliminates all hot spots because skewed access patterns (a few very popular keys) still concentrate load on specific nodes",
      "Don't change the hash function after deployment because it invalidates the entire key-to-node mapping and forces a full data migration"
    ],
    "donts_zh": [
      "不要在非常小的集群（2-3个节点）中使用一致性哈希，因为开销不必要且简单分区即可满足",
      "实现一致性哈希时不要忘记复制，因为仅哈希环分配不提供容错能力",
      "不要假设一致性哈希能消除所有热点，因为倾斜的访问模式（少量非常热门的键）仍会将负载集中在特定节点",
      "部署后不要更改哈希函数，因为这会使整个键到节点的映射失效并强制全量数据迁移"
    ],
    "case_study_company": "Akamai",
    "case_study": "Consistent hashing was originally invented by Karger et al. at MIT in collaboration with Akamai Technologies to solve the web content caching problem. Akamai's CDN needed to distribute cached web objects across thousands of edge servers worldwide, and when servers were added or removed, traditional modular hashing would invalidate nearly all cache entries, causing a thundering herd of requests to origin servers. With consistent hashing, adding or removing a server affects only 1/N of the cached keys, keeping cache hit rates stable during cluster changes. This approach remains fundamental to Akamai's CDN, which serves over 30% of global web traffic.",
    "case_study_zh": "一致性哈希最初由MIT的Karger等人与Akamai Technologies合作发明，用于解决Web内容缓存问题。Akamai的CDN需要将缓存的Web对象分布到全球数千个边缘服务器上，当服务器增减时，传统的取模哈希会使几乎所有缓存条目失效，导致对源服务器的惊群请求。使用一致性哈希后，添加或移除服务器只影响1/N的缓存键，在集群变更期间保持缓存命中率稳定。这种方法至今仍是Akamai CDN的基础，该CDN承载了全球超过30%的Web流量。",
    "when_not_to_use": [
      "Static clusters that never change size, where simple range or modular partitioning is sufficient and easier to reason about",
      "Workloads requiring strict ordering or range queries, where hash-based partitioning scatters related keys across nodes",
      "Single-node systems or systems with a dedicated partitioning coordinator that can use more sophisticated placement algorithms"
    ],
    "when_not_to_use_zh": [
      "永远不会改变大小的静态集群，简单的范围或取模分区已足够且更易推理",
      "需要严格排序或范围查询的工作负载，基于哈希的分区会将相关键分散到不同节点",
      "单节点系统或有专用分区协调器可以使用更复杂放置算法的系统"
    ],
    "adopters": [
      "Akamai",
      "Apache Cassandra",
      "Amazon DynamoDB",
      "Memcached (ketama)",
      "Riak"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "performance"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Karger, D., Lehman, E., Leighton, T. et al. (1997). \"Consistent Hashing and Random Trees: Distributed Caching Protocols for Relieving Hot Spots on the World Wide Web\". Proceedings of ACM STOC.",
    "secondary_sources": [
      "DeCandia, G. et al. (2007). \"Dynamo: Amazon's Highly Available Key-Value Store\". Proceedings of ACM SOSP.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 6. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "sharding-strategies",
        "type": "complement"
      },
      {
        "slug": "gossip-protocol",
        "type": "complement"
      },
      {
        "slug": "cap-theorem",
        "type": "prerequisite"
      }
    ]
  },
  {
    "id": 127,
    "name": "Gossip Protocol",
    "name_zh": "Gossip协议",
    "slug": "gossip-protocol",
    "category": "distributed",
    "desc": "Epidemic-style information dissemination for decentralized cluster communication",
    "desc_zh": "以流行病传播方式在去中心化集群中进行信息扩散",
    "steps": [
      "Bootstrap the protocol: each node maintains a partial membership list of known peers and seeds the list from a configuration file or DNS-based discovery",
      "Select gossip targets: at each gossip interval (e.g., every 1 second), each node randomly selects a fixed number of peers (fanout, typically 2-3) to exchange state with",
      "Exchange state digests: nodes send compact digests of their known state (version vectors, heartbeat counters, or Bloom filters) to detect what the peer is missing",
      "Reconcile differences: after comparing digests, nodes exchange only the delta — new or updated entries — bringing both nodes closer to a shared view of the cluster",
      "Detect failures: use the accumulated heartbeat information to mark nodes as suspected or confirmed down after a configurable number of missed gossip rounds"
    ],
    "steps_zh": [
      "引导协议：每个节点维护已知对等节点的部分成员列表，从配置文件或基于DNS的发现初始化该列表",
      "选择gossip目标：在每个gossip间隔（如每1秒），每个节点随机选择固定数量的对等节点（扇出，通常2-3个）进行状态交换",
      "交换状态摘要：节点发送其已知状态的紧凑摘要（版本向量、心跳计数器或布隆过滤器）以检测对方缺少的内容",
      "协调差异：比较摘要后，节点仅交换增量——新的或更新的条目——使两个节点更接近集群的共享视图",
      "检测故障：使用累积的心跳信息，在可配置数量的gossip轮次未收到响应后将节点标记为疑似或确认宕机"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Membership List",
      "Gossip Targets",
      "State Digest",
      "Delta Reconcile",
      "Failure Detect"
    ],
    "viz_labels_zh": [
      "成员列表",
      "Gossip目标",
      "状态摘要",
      "差量同步",
      "故障检测"
    ],
    "related": [
      "consistent-hashing",
      "eventual-consistency",
      "leader-election"
    ],
    "tags": [
      "membership",
      "failure-detection",
      "decentralized",
      "epidemic"
    ],
    "origin_author": "Alan Demers, Dan Greene, Carl Hauser, et al. (Xerox PARC, 1987)",
    "origin_source": "Epidemic Algorithms for Replicated Database Maintenance (ACM PODC 1987); Kleppmann, Designing Data-Intensive Applications, Ch. 5; van Renesse et al., Efficient Reconciliation and Flow Control for Anti-Entropy Protocols (2008)",
    "origin_source_zh": "《复制数据库维护的流行病算法》（ACM PODC 1987）；Kleppmann《数据密集型应用系统设计》第5章；van Renesse等《反熵协议的高效协调与流控》（2008）",
    "complexity": "intermediate",
    "when_to_use": [
      "When cluster membership information must propagate to all nodes without a centralized membership service",
      "When the system needs a decentralized failure detector that works without a single point of failure",
      "When metadata (configuration, schema versions, feature flags) must eventually reach all nodes in a large cluster",
      "When building peer-to-peer systems where no node has a privileged role and information must spread organically"
    ],
    "when_to_use_zh": [
      "当集群成员信息必须在没有集中式成员服务的情况下传播到所有节点时",
      "当系统需要无单点故障的去中心化故障检测器时",
      "当元数据（配置、模式版本、特性开关）必须最终到达大型集群中的所有节点时",
      "当构建没有特权角色节点、信息必须有机扩散的点对点系统时"
    ],
    "core_concepts": [
      "Epidemic dissemination: Information spreads like a disease — each informed node infects a few others per round, achieving O(log N) convergence time for N nodes",
      "Fanout: The number of random peers each node contacts per gossip round; higher fanout speeds convergence but increases network traffic",
      "Cramer-von Mises convergence: Mathematical guarantee that all nodes receive the update with high probability after O(log N) rounds with constant fanout",
      "SWIM protocol: Scalable Weakly-consistent Infection-style Membership protocol that combines gossip with direct and indirect probes for efficient failure detection",
      "Anti-entropy: A complementary mechanism where nodes periodically do full state reconciliation (using Merkle trees) to repair any inconsistencies that gossip missed"
    ],
    "core_concepts_zh": [
      "流行病扩散：信息像疾病一样传播——每轮中每个已知节点感染几个其他节点，对N个节点实现O(log N)的收敛时间",
      "扇出：每轮gossip中每个节点联系的随机对等节点数量；更高的扇出加快收敛但增加网络流量",
      "收敛保证：数学上保证在常数扇出下经过O(log N)轮后所有节点以高概率接收到更新",
      "SWIM协议：可扩展弱一致感染式成员协议，将gossip与直接和间接探测结合以实现高效的故障检测",
      "反熵：补充机制，节点定期进行完整的状态协调（使用Merkle树）以修复gossip遗漏的任何不一致"
    ],
    "timeline": [
      [
        "1987",
        "Demers et al. at Xerox PARC publish Epidemic Algorithms for Replicated Database Maintenance"
      ],
      [
        "2002",
        "Das, Gupta, and Motivala introduce SWIM, a gossip-based membership protocol used in many modern systems"
      ],
      [
        "2007",
        "Amazon's Dynamo paper describes using gossip for membership and failure detection in production"
      ],
      [
        "2013",
        "HashiCorp's Serf and later Consul adopt SWIM-based gossip (memberlist library) for cluster membership"
      ],
      [
        "2017",
        "Kleppmann's DDIA contextualizes gossip protocols within the broader landscape of distributed systems primitives"
      ]
    ],
    "timeline_zh": [
      [
        "1987",
        "Xerox PARC的Demers等人发表「复制数据库维护的流行病算法」"
      ],
      [
        "2002",
        "Das、Gupta和Motivala提出SWIM，一种被许多现代系统使用的基于gossip的成员协议"
      ],
      [
        "2007",
        "Amazon的Dynamo论文描述在生产环境中使用gossip进行成员管理和故障检测"
      ],
      [
        "2013",
        "HashiCorp的Serf和后来的Consul采用基于SWIM的gossip（memberlist库）进行集群成员管理"
      ],
      [
        "2017",
        "Kleppmann的《数据密集型应用系统设计》将gossip协议置于分布式系统原语的更广阔背景中进行介绍"
      ]
    ],
    "dos": [
      "Do tune the gossip interval and fanout based on cluster size because defaults optimized for 10 nodes may not work for 10,000 nodes",
      "Do use compact state digests (Bloom filters, version vectors) because sending full state on every gossip round wastes bandwidth at scale",
      "Do combine gossip with anti-entropy reconciliation because gossip alone can leave a small probability of nodes missing updates indefinitely",
      "Do implement suspicion mechanisms (like SWIM's indirect probe) because immediately marking a node as dead on one missed heartbeat leads to flapping"
    ],
    "dos_zh": [
      "根据集群大小调整gossip间隔和扇出，因为针对10个节点优化的默认值可能不适用于10000个节点",
      "使用紧凑的状态摘要（布隆过滤器、版本向量），因为每轮gossip发送完整状态在大规模下浪费带宽",
      "将gossip与反熵协调结合，因为仅靠gossip可能留下节点无限期遗漏更新的小概率",
      "实现怀疑机制（如SWIM的间接探测），因为一次心跳缺失就将节点标记为宕机会导致状态抖动"
    ],
    "donts": [
      "Don't use gossip for data that requires strong consistency because gossip provides only eventual convergence with no ordering guarantees",
      "Don't set the gossip interval too aggressively in large clusters because the O(N * fanout) messages per interval can create significant network overhead",
      "Don't assume gossip propagation is instantaneous because in a 1000-node cluster with 1-second intervals, full propagation may take 10+ seconds",
      "Don't gossip large payloads (e.g., entire database snapshots) because gossip is designed for small metadata, not bulk data transfer"
    ],
    "donts_zh": [
      "不要将gossip用于需要强一致性的数据，因为gossip仅提供最终收敛且没有排序保证",
      "不要在大型集群中设置过于激进的gossip间隔，因为每个间隔O(N * fanout)条消息会产生显著的网络开销",
      "不要假设gossip传播是即时的，在1000个节点1秒间隔的集群中，完全传播可能需要10秒以上",
      "不要通过gossip传播大型负载（如完整数据库快照），因为gossip设计用于小型元数据而非批量数据传输"
    ],
    "case_study_company": "Apache Cassandra",
    "case_study": "Apache Cassandra uses a gossip protocol for cluster membership, failure detection, and schema dissemination. Every second, each Cassandra node selects up to three peers to exchange gossip state, including heartbeat counters, datacenter/rack topology, schema versions, and load information. This decentralized approach means Cassandra has no single point of failure for cluster coordination — any node can serve as a contact point for clients. When a node fails, gossip-based failure detection (using a phi-accrual failure detector built on gossip heartbeats) propagates the failure information across the cluster within seconds, allowing the remaining nodes to take over the failed node's token ranges without a centralized coordinator.",
    "case_study_zh": "Apache Cassandra使用gossip协议进行集群成员管理、故障检测和模式传播。每秒钟，每个Cassandra节点选择最多三个对等节点交换gossip状态，包括心跳计数器、数据中心/机架拓扑、模式版本和负载信息。这种去中心化方法意味着Cassandra在集群协调上没有单点故障——任何节点都可以作为客户端的联系点。当节点故障时，基于gossip的故障检测（使用构建在gossip心跳上的phi累积故障检测器）在数秒内将故障信息传播到整个集群，使剩余节点无需集中式协调器即可接管故障节点的令牌范围。",
    "when_not_to_use": [
      "When strong consistency and total ordering of events is required, because gossip provides only best-effort dissemination",
      "Small clusters (under 5 nodes) where a centralized membership service is simpler and equally reliable",
      "When propagation latency must be bounded and deterministic, because gossip convergence is probabilistic"
    ],
    "when_not_to_use_zh": [
      "当需要强一致性和事件全序时，因为gossip仅提供尽力而为的信息扩散",
      "小型集群（少于5个节点），集中式成员服务更简单且同样可靠",
      "当传播延迟必须有界和确定性时，因为gossip的收敛是概率性的"
    ],
    "adopters": [
      "Apache Cassandra",
      "HashiCorp Consul",
      "Amazon DynamoDB",
      "ScyllaDB",
      "Redis Cluster"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Demers, A., Greene, D., Hauser, C. et al. (1987). \"Epidemic Algorithms for Replicated Database Maintenance\". Proceedings of ACM PODC.",
    "secondary_sources": [
      "van Renesse, R., Minsky, Y. & Hayden, M. (2008). \"Efficient Reconciliation and Flow Control for Anti-Entropy Protocols\". Cornell University.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 5. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "consistent-hashing",
        "type": "complement"
      },
      {
        "slug": "eventual-consistency",
        "type": "extends"
      },
      {
        "slug": "leader-election",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 128,
    "name": "Two-Phase Commit (2PC)",
    "name_zh": "两阶段提交（2PC）",
    "slug": "two-phase-commit",
    "category": "distributed",
    "desc": "Atomic commit protocol ensuring all-or-nothing transaction outcomes across distributed participants",
    "desc_zh": "确保分布式参与者之间事务要么全部提交要么全部回滚的原子提交协议",
    "steps": [
      "Phase 1 — Prepare: The coordinator sends a PREPARE message to all participants, asking each to vote on whether it can commit the transaction",
      "Participant vote: Each participant performs all transaction work (writes to WAL, acquires locks) and responds with VOTE-COMMIT if ready or VOTE-ABORT if not",
      "Coordinator decision: If all participants vote COMMIT, the coordinator writes a COMMIT decision to its own durable log; if any participant votes ABORT, the coordinator decides ABORT",
      "Phase 2 — Commit/Abort: The coordinator sends the decision (COMMIT or ABORT) to all participants, who apply or roll back accordingly and acknowledge",
      "Recovery: If the coordinator crashes between phases, participants that voted COMMIT remain in a blocked (in-doubt) state until the coordinator recovers and resends the decision"
    ],
    "steps_zh": [
      "阶段一——准备：协调者向所有参与者发送PREPARE消息，要求每个参与者投票决定是否能提交事务",
      "参与者投票：每个参与者执行所有事务工作（写入WAL、获取锁），如果准备好则回复VOTE-COMMIT，否则回复VOTE-ABORT",
      "协调者决策：如果所有参与者投票COMMIT，协调者将COMMIT决定写入自己的持久化日志；如果任何参与者投票ABORT，协调者决定ABORT",
      "阶段二——提交/回滚：协调者将决定（COMMIT或ABORT）发送给所有参与者，参与者相应地应用或回滚并确认",
      "恢复：如果协调者在两个阶段之间崩溃，已投票COMMIT的参与者将保持阻塞（不确定）状态，直到协调者恢复并重发决定"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Prepare",
      "Vote",
      "Coordinator Decision",
      "Commit / Abort",
      "Recovery"
    ],
    "viz_labels_zh": [
      "准备阶段",
      "投票",
      "协调者决定",
      "提交中止",
      "故障恢复"
    ],
    "related": [
      "saga-pattern",
      "eventual-consistency",
      "consensus-protocols"
    ],
    "tags": [
      "transactions",
      "atomicity",
      "coordination",
      "distributed-databases"
    ],
    "origin_author": "Jim Gray, 1978",
    "origin_source": "Notes on Data Base Operating Systems (1978); Kleppmann, Designing Data-Intensive Applications, Ch. 9; Bernstein, Hadzilacos, Goodman, Concurrency Control and Recovery in Database Systems (1987)",
    "origin_source_zh": "《数据库操作系统笔记》（1978）；Kleppmann《数据密集型应用系统设计》第9章；Bernstein、Hadzilacos、Goodman《数据库系统中的并发控制与恢复》（1987）",
    "complexity": "advanced",
    "when_to_use": [
      "When multiple databases or resource managers must commit or abort as a single atomic unit, such as in XA transactions",
      "When financial or regulatory requirements demand that cross-system updates are either fully applied or fully rolled back",
      "When coordinating writes across heterogeneous data stores (e.g., a relational database and a message queue) that must remain consistent",
      "When the number of participants is small and the transaction duration is short, minimizing the window for blocking"
    ],
    "when_to_use_zh": [
      "当多个数据库或资源管理器必须作为单一原子单元提交或回滚时，如XA事务",
      "当金融或监管要求跨系统更新必须完全应用或完全回滚时",
      "当协调必须保持一致的异构数据存储间的写入时（如关系数据库和消息队列）",
      "当参与者数量少且事务持续时间短，最小化阻塞窗口时"
    ],
    "core_concepts": [
      "Atomicity: The fundamental guarantee that either all participants commit or all abort — no partial outcomes are possible",
      "Blocking problem: If the coordinator fails after sending PREPARE but before sending the decision, participants holding locks are blocked indefinitely until the coordinator recovers",
      "Write-ahead logging: Both coordinator and participants durably log their decisions before sending messages, enabling recovery after crashes",
      "Presumed abort: An optimization where the coordinator does not log ABORT decisions; if a participant asks and finds no commit record, it aborts by default",
      "XA standard: The X/Open DTP model that defines the 2PC interface (xa_prepare, xa_commit, xa_rollback) for interoperability between transaction managers and resource managers"
    ],
    "core_concepts_zh": [
      "原子性：基本保证——要么所有参与者提交，要么所有参与者回滚，不可能出现部分结果",
      "阻塞问题：如果协调者在发送PREPARE之后、发送决定之前故障，持有锁的参与者将无限期阻塞直到协调者恢复",
      "预写日志：协调者和参与者在发送消息之前都将其决定持久化到日志，支持崩溃后恢复",
      "假定中止：一种优化，协调者不记录ABORT决定；如果参与者查询时未找到提交记录，则默认中止",
      "XA标准：X/Open DTP模型，定义了2PC接口（xa_prepare、xa_commit、xa_rollback），用于事务管理器和资源管理器之间的互操作"
    ],
    "timeline": [
      [
        "1978",
        "Jim Gray describes the two-phase commit protocol in Notes on Data Base Operating Systems"
      ],
      [
        "1987",
        "Bernstein, Hadzilacos, and Goodman formalize 2PC theory in Concurrency Control and Recovery in Database Systems"
      ],
      [
        "1991",
        "X/Open publishes the XA specification standardizing the 2PC interface for distributed transactions"
      ],
      [
        "2007",
        "Pat Helland's Life Beyond Distributed Transactions argues for moving away from 2PC toward compensating transactions"
      ],
      [
        "2017",
        "Kleppmann's DDIA critically examines 2PC's blocking nature and advocates for alternatives like sagas in most microservice architectures"
      ]
    ],
    "timeline_zh": [
      [
        "1978",
        "Jim Gray在《数据库操作系统笔记》中描述两阶段提交协议"
      ],
      [
        "1987",
        "Bernstein、Hadzilacos和Goodman在《数据库系统中的并发控制与恢复》中形式化2PC理论"
      ],
      [
        "1991",
        "X/Open发布XA规范，标准化分布式事务的2PC接口"
      ],
      [
        "2007",
        "Pat Helland的「分布式事务之外的生活」主张从2PC转向补偿事务"
      ],
      [
        "2017",
        "Kleppmann的《数据密集型应用系统设计》批判性地审视2PC的阻塞特性，在大多数微服务架构中倡导saga等替代方案"
      ]
    ],
    "dos": [
      "Do keep the prepare-to-commit window as short as possible because participants hold locks during this entire period, blocking other transactions",
      "Do implement robust recovery procedures for the coordinator because its failure is the most dangerous scenario in 2PC",
      "Do use 2PC only within a single trust boundary because coordinating 2PC across organizational boundaries creates unacceptable operational coupling",
      "Do monitor in-doubt transaction counts because even rare coordinator failures can leave orphaned locks that block entire tables"
    ],
    "dos_zh": [
      "将准备到提交的窗口保持尽可能短，因为参与者在此整个期间持有锁，阻塞其他事务",
      "为协调者实现健壮的恢复程序，因为协调者故障是2PC中最危险的场景",
      "仅在单一信任边界内使用2PC，因为跨组织边界协调2PC会产生不可接受的运维耦合",
      "监控不确定事务数量，因为即使罕见的协调者故障也可能留下阻塞整个表的孤立锁"
    ],
    "donts": [
      "Don't use 2PC across microservice boundaries in most cases because the blocking nature and tight coupling contradicts microservice autonomy",
      "Don't hold expensive resources (like database connections or row locks) during the prepare phase longer than necessary because it degrades overall system throughput",
      "Don't assume 2PC provides consensus because it only guarantees atomicity; a coordinator failure still blocks the protocol unlike true consensus algorithms",
      "Don't mix 2PC with long-running business processes because the lock duration becomes unbounded and starves concurrent operations"
    ],
    "donts_zh": [
      "大多数情况下不要跨微服务边界使用2PC，因为其阻塞特性和紧耦合与微服务自治性相矛盾",
      "不要在准备阶段持有昂贵资源（如数据库连接或行锁）超过必要时间，因为这会降低整体系统吞吐",
      "不要假设2PC提供共识，它只保证原子性；协调者故障仍会阻塞协议，不像真正的共识算法",
      "不要将2PC与长时间运行的业务流程混合，因为锁持续时间变得无界并饿死并发操作"
    ],
    "case_study_company": "Google Spanner",
    "case_study": "Google Spanner uses a variant of 2PC combined with Paxos to achieve globally distributed ACID transactions. Each Spanner shard is a Paxos group, and cross-shard transactions use 2PC where each participant shard commits through its own Paxos group. This hybrid eliminates 2PC's blocking problem: if the coordinator fails, the Paxos group that replicated the coordinator's state elects a new coordinator and completes the protocol. Spanner's TrueTime API (GPS + atomic clocks) provides globally synchronized timestamps that order transactions without the traditional 2PC lock contention. This architecture enables Google to run globally consistent transactions with single-digit millisecond latencies for workloads like Google Ads and Google Play, something previously considered impossible for geographically distributed systems.",
    "case_study_zh": "Google Spanner使用2PC的变体结合Paxos实现全球分布式ACID事务。每个Spanner分片是一个Paxos组，跨分片事务使用2PC，每个参与者分片通过自己的Paxos组提交。这种混合方式消除了2PC的阻塞问题：如果协调者故障，复制了协调者状态的Paxos组选出新的协调者并完成协议。Spanner的TrueTime API（GPS+原子钟）提供全球同步的时间戳来排序事务，无需传统2PC的锁竞争。这种架构使Google能够为Google Ads和Google Play等工作负载运行全球一致的事务且延迟在个位数毫秒级别，这在以前被认为对地理分布式系统是不可能的。",
    "when_not_to_use": [
      "Microservice architectures where service autonomy and independent deployability are more important than cross-service atomicity",
      "High-throughput systems where lock contention from the prepare phase would create unacceptable performance bottlenecks",
      "Systems spanning multiple organizations or trust boundaries where no single coordinator can be trusted by all parties"
    ],
    "when_not_to_use_zh": [
      "服务自治和独立部署能力比跨服务原子性更重要的微服务架构",
      "准备阶段的锁竞争会产生不可接受的性能瓶颈的高吞吐系统",
      "跨越多个组织或信任边界、没有单一协调者能被所有方信任的系统"
    ],
    "adopters": [
      "Google Spanner",
      "PostgreSQL (prepared transactions)",
      "Oracle Database (XA)",
      "MySQL (XA)",
      "Microsoft SQL Server (MSDTC)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Gray, J. (1978). \"Notes on Data Base Operating Systems\". In: Operating Systems, An Advanced Course. Lecture Notes in Computer Science, Vol. 60. Springer.",
    "secondary_sources": [
      "Bernstein, P.A., Hadzilacos, V. & Goodman, N. (1987). \"Concurrency Control and Recovery in Database Systems\". Addison-Wesley.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 9. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "saga-pattern",
        "type": "alternative"
      },
      {
        "slug": "eventual-consistency",
        "type": "alternative"
      },
      {
        "slug": "consensus-protocols",
        "type": "extends"
      }
    ]
  },
  {
    "id": 129,
    "name": "Sharding Strategies",
    "name_zh": "分片策略",
    "slug": "sharding-strategies",
    "category": "distributed",
    "desc": "Horizontal data partitioning patterns for distributing load across multiple database nodes",
    "desc_zh": "将数据水平分区到多个数据库节点以分散负载的模式",
    "steps": [
      "Analyze access patterns: identify the most common queries, their selectivity, and which fields they filter on to determine the optimal shard key",
      "Choose a sharding strategy: select hash-based sharding for uniform distribution, range-based sharding for range queries, or directory-based sharding for complex routing",
      "Define the shard key: select a high-cardinality field (e.g., user_id, tenant_id) that distributes writes evenly and keeps related data co-located for common query patterns",
      "Implement query routing: build a routing layer (application-level, proxy, or coordinator) that directs each query to the correct shard(s) based on the shard key",
      "Plan for rebalancing: design the migration strategy for splitting hot shards or redistributing data when adding nodes, using techniques like consistent hashing or logical sharding"
    ],
    "steps_zh": [
      "分析访问模式：识别最常见的查询、其选择性以及过滤的字段，以确定最优分片键",
      "选择分片策略：选择基于哈希的分片实现均匀分布、基于范围的分片支持范围查询，或基于目录的分片实现复杂路由",
      "定义分片键：选择高基数字段（如user_id、tenant_id），确保写入均匀分布且相关数据对常见查询模式保持共置",
      "实现查询路由：构建路由层（应用级、代理或协调器），根据分片键将每个查询定向到正确的分片",
      "规划重新平衡：设计分割热分片或在添加节点时重新分配数据的迁移策略，使用一致性哈希或逻辑分片等技术"
    ],
    "ai_relevant": false,
    "viz_type": "tree",
    "viz_labels": [
      "Hash Shard",
      "Range Shard",
      "Directory Shard",
      "Query Router",
      "Rebalance"
    ],
    "viz_labels_zh": [
      "哈希分片",
      "范围分片",
      "目录分片",
      "查询路由",
      "再平衡"
    ],
    "related": [
      "consistent-hashing",
      "cap-theorem",
      "eventual-consistency"
    ],
    "tags": [
      "partitioning",
      "scalability",
      "databases",
      "horizontal-scaling"
    ],
    "origin_author": "Concept from database research (1980s-1990s); formalized in modern context by Kleppmann (2017)",
    "origin_source": "Kleppmann, Designing Data-Intensive Applications, Ch. 6; Newman, Building Microservices, Ch. 4 (2nd ed., 2021); Sadalage & Fowler, NoSQL Distilled (2012)",
    "origin_source_zh": "Kleppmann《数据密集型应用系统设计》第6章；Newman《构建微服务》第4章（第二版，2021）；Sadalage和Fowler《NoSQL精粹》（2012）",
    "complexity": "advanced",
    "when_to_use": [
      "When a single database instance cannot handle the write throughput or storage volume required by the application",
      "When query latency is degraded by the sheer size of tables and indexes on a single node, even after vertical scaling",
      "When multi-tenant applications need data isolation guarantees where each tenant's data resides on a dedicated shard",
      "When geographic data locality is required to minimize latency by placing shards close to their primary user base"
    ],
    "when_to_use_zh": [
      "当单个数据库实例无法处理应用所需的写入吞吐量或存储容量时",
      "当即使垂直扩展后，单节点上表和索引的庞大规模导致查询延迟下降时",
      "当多租户应用需要数据隔离保证，每个租户的数据驻留在专用分片上时",
      "当需要地理数据局部性以通过将分片放置在主要用户群附近来最小化延迟时"
    ],
    "core_concepts": [
      "Hash-based sharding: Apply a hash function to the shard key and assign data to shards by hash range; ensures uniform distribution but prevents efficient range queries",
      "Range-based sharding: Assign contiguous key ranges to each shard; supports range queries but risks hot spots if access patterns cluster around certain ranges",
      "Directory-based sharding: A lookup table maps each key to its shard; maximally flexible but introduces the directory as a single point of failure and bottleneck",
      "Cross-shard queries: Queries that span multiple shards require scatter-gather coordination, which is significantly more expensive than single-shard queries",
      "Shard rebalancing: The process of redistributing data across shards when adding or removing nodes, which must be done without downtime using techniques like logical sharding or dual-write migration"
    ],
    "core_concepts_zh": [
      "基于哈希的分片：对分片键应用哈希函数并按哈希范围分配数据到分片；确保均匀分布但阻碍高效的范围查询",
      "基于范围的分片：将连续的键范围分配给每个分片；支持范围查询但如果访问模式集中在某些范围则有热点风险",
      "基于目录的分片：查找表将每个键映射到其分片；灵活性最大但引入目录作为单点故障和瓶颈",
      "跨分片查询：跨越多个分片的查询需要分散-聚合协调，成本远高于单分片查询",
      "分片重新平衡：在添加或移除节点时重新分配数据到分片的过程，必须使用逻辑分片或双写迁移等技术在不停机的情况下完成"
    ],
    "timeline": [
      [
        "1986",
        "DeWitt and Gray publish papers on parallel database systems with horizontal partitioning strategies"
      ],
      [
        "2007",
        "Amazon's Dynamo and Google's Bigtable papers demonstrate sharding at web scale, influencing the NoSQL movement"
      ],
      [
        "2010",
        "MongoDB introduces auto-sharding, making hash-based and range-based sharding accessible to application developers"
      ],
      [
        "2017",
        "Kleppmann's DDIA provides the definitive practitioner guide to partitioning strategies and their trade-offs"
      ],
      [
        "2020",
        "Vitess (YouTube's MySQL sharding middleware) becomes a CNCF graduated project, proving sharding remains essential at scale"
      ]
    ],
    "timeline_zh": [
      [
        "1986",
        "DeWitt和Gray发表关于并行数据库系统水平分区策略的论文"
      ],
      [
        "2007",
        "Amazon的Dynamo和Google的Bigtable论文展示Web级别的分片，影响了NoSQL运动"
      ],
      [
        "2010",
        "MongoDB引入自动分片，使应用开发者能够使用基于哈希和基于范围的分片"
      ],
      [
        "2017",
        "Kleppmann的《数据密集型应用系统设计》提供分区策略及其权衡的权威实践者指南"
      ],
      [
        "2020",
        "Vitess（YouTube的MySQL分片中间件）成为CNCF毕业项目，证明分片在大规模下仍然不可或缺"
      ]
    ],
    "dos": [
      "Do choose a shard key with high cardinality and uniform distribution because low-cardinality keys create unbalanceable hot shards",
      "Do co-locate related data on the same shard whenever possible because cross-shard joins are orders of magnitude slower than local joins",
      "Do implement logical sharding from the start (many logical shards per physical node) because it makes future physical rebalancing a matter of reassigning logical shards rather than splitting data",
      "Do plan for the scatter-gather query pattern because some queries will inevitably span multiple shards and need a coordination layer"
    ],
    "dos_zh": [
      "选择高基数且均匀分布的分片键，因为低基数键会产生无法平衡的热分片",
      "尽可能将相关数据共置在同一分片上，因为跨分片连接比本地连接慢几个数量级",
      "从一开始就实现逻辑分片（每个物理节点多个逻辑分片），因为这使未来的物理重新平衡变成重新分配逻辑分片而非分割数据",
      "为分散-聚合查询模式做好规划，因为某些查询将不可避免地跨越多个分片并需要协调层"
    ],
    "donts": [
      "Don't shard prematurely because sharding adds significant operational complexity; exhaust vertical scaling and read replicas first",
      "Don't choose a shard key based on write patterns alone because read-heavy queries that span all shards will create a scatter-gather bottleneck",
      "Don't forget about secondary indexes because global secondary indexes in a sharded system require cross-shard coordination or eventual consistency",
      "Don't assume you can change the shard key later without a full data migration because the shard key determines the physical data layout"
    ],
    "donts_zh": [
      "不要过早分片，因为分片增加了显著的运维复杂性；先用尽垂直扩展和读副本",
      "不要仅基于写入模式选择分片键，因为跨所有分片的读密集查询会产生分散-聚合瓶颈",
      "不要忘记二级索引，因为分片系统中的全局二级索引需要跨分片协调或最终一致性",
      "不要假设之后可以在不全量数据迁移的情况下更改分片键，因为分片键决定了物理数据布局"
    ],
    "case_study_company": "Slack",
    "case_study": "Slack shards its MySQL databases by workspace (team_id), ensuring that all messages, channels, and metadata for a single workspace reside on the same shard. This design eliminates cross-shard queries for the most common access pattern — loading messages in a channel for a specific workspace. When Slack grew from hundreds to millions of workspaces, they implemented a logical sharding layer called Bedrock that maps workspace IDs to physical shards. When a physical shard becomes too hot, Bedrock moves logical shards to a new physical node with zero-downtime migration using dual-write and cutover. This approach allowed Slack to scale from a single MySQL instance to thousands of shards while maintaining sub-100ms query latencies for workspace-scoped operations.",
    "case_study_zh": "Slack按工作空间（team_id）对MySQL数据库进行分片，确保单个工作空间的所有消息、频道和元数据驻留在同一分片上。这种设计消除了最常见访问模式——加载特定工作空间中频道消息——的跨分片查询。当Slack从数百增长到数百万个工作空间时，他们实现了名为Bedrock的逻辑分片层，将工作空间ID映射到物理分片。当物理分片过热时，Bedrock使用双写和切换将逻辑分片零停机迁移到新的物理节点。这种方法使Slack从单个MySQL实例扩展到数千个分片，同时将工作空间范围操作的查询延迟保持在100毫秒以内。",
    "when_not_to_use": [
      "Datasets that fit comfortably on a single database node with acceptable query performance",
      "Workloads dominated by cross-entity analytical queries where sharding would require constant scatter-gather operations",
      "Early-stage products where the data model is still evolving and premature sharding would lock in a suboptimal partition scheme"
    ],
    "when_not_to_use_zh": [
      "在单个数据库节点上能舒适容纳且查询性能可接受的数据集",
      "以跨实体分析查询为主的工作负载，分片将需要持续的分散-聚合操作",
      "数据模型仍在演进的早期产品，过早分片会锁定次优的分区方案"
    ],
    "adopters": [
      "Slack",
      "Pinterest",
      "Vitess/YouTube",
      "Instagram",
      "MongoDB Atlas"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "performance"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 6: Partitioning. O'Reilly Media.",
    "secondary_sources": [
      "Sadalage, P.J. & Fowler, M. (2012). \"NoSQL Distilled: A Brief Guide to the Emerging World of Polyglot Persistence\". Addison-Wesley.",
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 4. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "consistent-hashing",
        "type": "complement"
      },
      {
        "slug": "cap-theorem",
        "type": "prerequisite"
      },
      {
        "slug": "eventual-consistency",
        "type": "related"
      }
    ]
  },
  {
    "id": 130,
    "name": "Idempotency Pattern",
    "name_zh": "幂等性模式",
    "slug": "idempotency-pattern",
    "category": "distributed",
    "desc": "Design operations to be safely retried without causing duplicate effects",
    "desc_zh": "将操作设计为可安全重试且不产生重复效果",
    "steps": [
      "Assign a unique idempotency key: the client generates a UUID or deterministic key for each logical operation and includes it in every request and retry",
      "Check for prior execution: before processing, the server looks up the idempotency key in a durable store to determine if this operation has already been processed",
      "Execute and store the result: if the key is new, process the operation, persist the result alongside the idempotency key in the same transaction, and return the response",
      "Return the cached result for duplicates: if the key already exists, skip processing and return the previously stored response, ensuring the client sees the same outcome",
      "Expire old keys: implement a TTL-based cleanup policy for idempotency records because storing them indefinitely would consume unbounded storage"
    ],
    "steps_zh": [
      "分配唯一幂等键：客户端为每个逻辑操作生成UUID或确定性键，并在每次请求和重试中包含该键",
      "检查是否已执行：处理前，服务器在持久化存储中查找幂等键以确定该操作是否已被处理",
      "执行并存储结果：如果键是新的，处理操作，在同一事务中将结果与幂等键一起持久化，返回响应",
      "对重复请求返回缓存结果：如果键已存在，跳过处理并返回先前存储的响应，确保客户端看到相同结果",
      "过期旧键：为幂等记录实现基于TTL的清理策略，因为无限期存储会消耗无界的存储空间"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Idempotency Key",
      "Check Prior",
      "Execute and Store",
      "Return Cached",
      "Expire Keys"
    ],
    "viz_labels_zh": [
      "幂等键",
      "重复检查",
      "执行存储",
      "返回缓存",
      "过期清理"
    ],
    "related": [
      "saga-pattern",
      "eventual-consistency",
      "two-phase-commit"
    ],
    "tags": [
      "reliability",
      "retries",
      "deduplication",
      "at-least-once"
    ],
    "origin_author": "Concept from mathematics and HTTP specification (RFC 7231); applied to distributed systems by Helland (2012) and Kleppmann (2017)",
    "origin_source": "Kleppmann, Designing Data-Intensive Applications, Ch. 11; Nygard, Release It!, Ch. 5; Helland, Idempotence Is Not a Medical Condition (ACM Queue, 2012)",
    "origin_source_zh": "Kleppmann《数据密集型应用系统设计》第11章；Nygard《Release It!》第5章；Helland「幂等性不是一种疾病」（ACM Queue，2012）",
    "complexity": "intermediate",
    "when_to_use": [
      "When network failures or timeouts cause clients to retry requests, and duplicate execution would cause data corruption or financial loss",
      "When at-least-once message delivery semantics require deduplication at the consumer to achieve effectively-once processing",
      "When designing payment or order APIs where a duplicate charge or double order creation would be a critical business error",
      "When building webhook receivers that may receive the same event notification multiple times from an external provider"
    ],
    "when_to_use_zh": [
      "当网络故障或超时导致客户端重试请求，且重复执行会导致数据损坏或财务损失时",
      "当至少一次消息投递语义要求消费者进行去重以实现有效的恰好一次处理时",
      "当设计支付或订单API，重复扣款或重复创建订单是严重业务错误时",
      "当构建可能从外部提供者多次收到相同事件通知的webhook接收器时"
    ],
    "core_concepts": [
      "Idempotency key: A client-generated unique identifier (typically a UUID) sent with each request that allows the server to detect and deduplicate retries",
      "At-least-once to effectively-once: Combining at-least-once delivery with server-side idempotency achieves the effect of exactly-once processing without the complexity of true exactly-once protocols",
      "Natural vs. artificial idempotency: Some operations are naturally idempotent (SET x=5) while others require artificial idempotency (tracking that a specific payment was already processed)",
      "Idempotency window: The time period during which the server retains idempotency records; retries after the window has expired may be processed as new operations",
      "Side-effect isolation: External side effects (sending emails, calling third-party APIs) must be guarded separately because the idempotency store only protects the local state change"
    ],
    "core_concepts_zh": [
      "幂等键：客户端生成的唯一标识符（通常是UUID），随每个请求发送，允许服务器检测和去重重试",
      "从至少一次到有效恰好一次：将至少一次投递与服务端幂等性结合，实现恰好一次处理的效果而无需真正恰好一次协议的复杂性",
      "自然幂等与人工幂等：某些操作天然幂等（SET x=5），而其他操作需要人工幂等性（追踪特定支付是否已处理）",
      "幂等窗口：服务器保留幂等记录的时间段；窗口过期后的重试可能被作为新操作处理",
      "副作用隔离：外部副作用（发送邮件、调用第三方API）必须单独保护，因为幂等存储只保护本地状态变更"
    ],
    "timeline": [
      [
        "1999",
        "HTTP/1.1 (RFC 2616) formalizes idempotency for PUT and DELETE methods, establishing the concept in web API design"
      ],
      [
        "2007",
        "Amazon's Dynamo paper discusses idempotent operations as a requirement for safe retries in eventually consistent systems"
      ],
      [
        "2012",
        "Pat Helland publishes Idempotence Is Not a Medical Condition in ACM Queue, advocating for idempotent API design"
      ],
      [
        "2016",
        "Stripe introduces the Idempotency-Key header for payment APIs, setting an industry standard for financial API design"
      ],
      [
        "2017",
        "Kleppmann's DDIA discusses idempotency in the context of stream processing and exactly-once semantics"
      ]
    ],
    "timeline_zh": [
      [
        "1999",
        "HTTP/1.1（RFC 2616）为PUT和DELETE方法形式化幂等性，在Web API设计中确立该概念"
      ],
      [
        "2007",
        "Amazon的Dynamo论文讨论幂等操作作为最终一致系统中安全重试的需求"
      ],
      [
        "2012",
        "Pat Helland在ACM Queue发表「幂等性不是一种疾病」，倡导幂等API设计"
      ],
      [
        "2016",
        "Stripe为支付API引入Idempotency-Key头部，为金融API设计树立行业标准"
      ],
      [
        "2017",
        "Kleppmann的《数据密集型应用系统设计》在流处理和恰好一次语义的背景下讨论幂等性"
      ]
    ],
    "dos": [
      "Do store the idempotency key and result in the same atomic transaction as the business operation because separate stores can become inconsistent after crashes",
      "Do let the client generate the idempotency key because server-generated keys cannot be correlated with retries of the same logical operation",
      "Do return the same HTTP status code and response body for duplicate requests because clients rely on consistent responses to determine success",
      "Do set a reasonable TTL for idempotency records (e.g., 24-72 hours) because indefinite retention creates unbounded storage growth"
    ],
    "dos_zh": [
      "将幂等键和结果与业务操作存储在同一原子事务中，因为分开存储在崩溃后可能变得不一致",
      "让客户端生成幂等键，因为服务端生成的键无法与同一逻辑操作的重试相关联",
      "对重复请求返回相同的HTTP状态码和响应体，因为客户端依赖一致的响应来判断成功",
      "为幂等记录设置合理的TTL（如24-72小时），因为无限期保留会产生无界的存储增长"
    ],
    "donts": [
      "Don't treat all API endpoints as automatically idempotent because POST operations with side effects need explicit idempotency key handling",
      "Don't use timestamps or sequential IDs as idempotency keys because they are not unique across distributed clients and can collide",
      "Don't forget to handle the race condition where two identical requests arrive simultaneously because both may pass the 'key not found' check concurrently",
      "Don't assume idempotency keys protect against different operations because a key is bound to a specific operation — changing the request body with the same key should be rejected"
    ],
    "donts_zh": [
      "不要将所有API端点视为自动幂等的，因为带有副作用的POST操作需要显式的幂等键处理",
      "不要使用时间戳或顺序ID作为幂等键，因为它们在分布式客户端间不唯一且可能冲突",
      "不要忘记处理两个相同请求同时到达的竞态条件，因为两者可能同时通过「键未找到」检查",
      "不要假设幂等键能保护不同的操作，因为键绑定到特定操作——使用相同键但更改请求体应被拒绝"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe pioneered the industry-standard approach to API idempotency with their Idempotency-Key header. When a client creates a payment intent, it includes an Idempotency-Key header with a client-generated UUID. Stripe stores this key alongside the payment result in a database. If a network timeout occurs and the client retries with the same key, Stripe returns the original response without charging the customer again. Stripe's implementation handles a subtle edge case: if a retry arrives while the original request is still processing, Stripe returns a 409 Conflict rather than processing it in parallel, preventing race conditions. This pattern has been adopted across the payments industry, with companies like PayPal, Square, and Adyen implementing similar idempotency key mechanisms inspired by Stripe's design.",
    "case_study_zh": "Stripe通过其Idempotency-Key头部开创了API幂等性的行业标准方法。当客户端创建支付意图时，在请求中包含一个带有客户端生成UUID的Idempotency-Key头部。Stripe将此键与支付结果一起存储在数据库中。如果发生网络超时且客户端使用相同键重试，Stripe返回原始响应而不再次向客户收费。Stripe的实现处理了一个微妙的边缘情况：如果在原始请求仍在处理时收到重试，Stripe返回409 Conflict而非并行处理，防止竞态条件。该模式已被支付行业广泛采用，PayPal、Square和Adyen等公司都实现了类似的幂等键机制，灵感源自Stripe的设计。",
    "when_not_to_use": [
      "Read-only GET endpoints that are naturally idempotent and need no special handling",
      "Fire-and-forget telemetry ingestion where occasional duplicates are acceptable and the deduplication overhead is not justified",
      "Internal service calls within a single transaction boundary where the database already provides atomicity guarantees"
    ],
    "when_not_to_use_zh": [
      "天然幂等且不需要特殊处理的只读GET端点",
      "偶尔重复可接受且去重开销不值得的即发即忘遥测数据摄入",
      "单一事务边界内的内部服务调用，数据库已提供原子性保证"
    ],
    "adopters": [
      "Stripe",
      "PayPal",
      "Shopify",
      "Square",
      "AWS (SQS deduplication)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Helland, P. (2012). \"Idempotence Is Not a Medical Condition\". ACM Queue, 10(4).",
    "secondary_sources": [
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 11. O'Reilly Media.",
      "Nygard, M.T. (2018). \"Release It!\", 2nd ed., Ch. 5. Pragmatic Bookshelf."
    ],
    "typed_relations": [
      {
        "slug": "saga-pattern",
        "type": "complement"
      },
      {
        "slug": "eventual-consistency",
        "type": "complement"
      },
      {
        "slug": "two-phase-commit",
        "type": "complement"
      }
    ]
  },
  {
    "id": 205,
    "name": "CRDT (Conflict-free Replicated Data Types)",
    "name_zh": "无冲突复制数据类型",
    "slug": "crdt",
    "category": "distributed",
    "desc": "Data structures that auto-merge without coordination (Shapiro, 2011)",
    "desc_zh": "无需协调即可自动合并的数据结构（Shapiro，2011）",
    "steps": [
      "Identify the collaborative or distributed data-sharing use case where concurrent modifications by multiple nodes or users must be merged automatically without a coordination round-trip",
      "Select the appropriate CRDT type: G-Counter or PN-Counter for incrementing counters, G-Set or OR-Set for sets, LWW-Element-Set for last-write-wins semantics, or an RGA (Replicated Growable Array) for text collaboration",
      "Implement the CRDT's merge function (join in lattice terms) which must be commutative, associative, and idempotent so that applying the same update multiple times or in any order produces the same result",
      "Design the state or operation propagation strategy: state-based CRDTs (CvRDTs) gossip full state snapshots to peers; operation-based CRDTs (CmRDTs) propagate individual operations and require exactly-once delivery guarantees",
      "Validate convergence under network partition scenarios by simulating concurrent conflicting updates across replicas and verifying that all replicas reach identical state after reconnection, regardless of the order messages were received"
    ],
    "steps_zh": [
      "识别协作或分布式数据共享用例，其中多个节点或用户的并发修改必须自动合并，无需协调往返",
      "选择适当的CRDT类型：G-Counter或PN-Counter用于递增计数器，G-Set或OR-Set用于集合，LWW-Element-Set用于最后写入胜出语义，或RGA（可复制增长数组）用于文本协作",
      "实现CRDT的合并函数（格论中的连接），该函数必须是交换的、结合的和幂等的，使得多次或以任何顺序应用同一更新都产生相同结果",
      "设计状态或操作传播策略：基于状态的CRDT（CvRDT）向对等节点传播完整状态快照；基于操作的CRDT（CmRDT）传播单个操作，并需要精确一次交付保证",
      "通过模拟副本之间的并发冲突更新来验证网络分区场景下的收敛性，验证所有副本在重新连接后达到相同状态，无论消息的接收顺序如何"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Use Case",
      "CRDT Type",
      "Merge Function",
      "Propagation",
      "Convergence Test"
    ],
    "viz_labels_zh": [
      "使用场景",
      "CRDT类型",
      "合并函数",
      "状态传播",
      "收敛验证"
    ],
    "related": [
      "eventual-consistency",
      "gossip-protocol",
      "consensus-protocols",
      "outbox-pattern"
    ],
    "tags": [
      "crdt",
      "eventual-consistency",
      "distributed-state",
      "conflict-resolution",
      "collaboration"
    ],
    "origin_author": "Marc Shapiro, Nuno Preguiça, Carlos Baquero, Marek Zawirski, 2011",
    "origin_source": "Shapiro, M. et al. (2011). \"Conflict-Free Replicated Data Types\". Proceedings of SSS 2011, LNCS 6976. Springer.",
    "origin_source_zh": "Shapiro, M. 等（2011）。「无冲突复制数据类型」。SSS 2011论文集，LNCS 6976，Springer。",
    "complexity": "advanced",
    "when_to_use": [
      "When building real-time collaborative editing applications (documents, spreadsheets, whiteboards) where multiple users concurrently modify shared state",
      "When distributed systems must remain available during network partitions and data must be automatically reconciled upon reconnection without human intervention",
      "When a multi-region active-active database architecture requires conflict resolution that is provably correct without a centralized coordinator",
      "When mobile or offline-capable applications need to merge local changes made during disconnection with server-side changes upon sync"
    ],
    "when_to_use_zh": [
      "构建多用户并发修改共享状态的实时协作编辑应用程序（文档、电子表格、白板）时",
      "当分布式系统在网络分区期间必须保持可用，且数据必须在重新连接时自动协调而无需人工干预时",
      "当多区域主-主数据库架构需要无需集中协调器的可证明正确的冲突解决时",
      "当移动或支持离线的应用程序需要在同步时将断开连接期间进行的本地更改与服务器端更改合并时"
    ],
    "core_concepts": [
      "Monotonic join semi-lattice: the mathematical structure underlying CRDTs where the state space forms a lattice with a least-upper-bound merge operation that is commutative, associative, and idempotent",
      "State-based CRDT (CvRDT): replicas periodically exchange full state; the merge function computes the join of two states; requires only eventual message delivery",
      "Operation-based CRDT (CmRDT): replicas propagate operations that mutate state; requires exactly-once causal delivery but produces smaller network payloads",
      "Strong Eventual Consistency (SEC): the guarantee that all replicas that have received the same set of updates will have identical state, without requiring synchronous coordination"
    ],
    "core_concepts_zh": [
      "单调连接半格：CRDT底层的数学结构，状态空间形成具有最小上界合并操作的格，该操作是交换的、结合的和幂等的",
      "基于状态的CRDT（CvRDT）：副本定期交换完整状态；合并函数计算两个状态的连接；仅需最终消息交付",
      "基于操作的CRDT（CmRDT）：副本传播改变状态的操作；需要精确一次因果交付，但产生更小的网络负载",
      "强最终一致性（SEC）：保证所有收到同一组更新的副本将具有相同状态，无需同步协调"
    ],
    "timeline": [
      [
        "2006",
        "Logoot and other operational transformation (OT) approaches used in collaborative editors precede formal CRDT theory"
      ],
      [
        "2011",
        "Shapiro et al. publish the landmark 'A Comprehensive Study of Convergent and Commutative Replicated Data Types', formalizing CRDT theory"
      ],
      [
        "2014",
        "Riak 2.0 ships with production-ready CRDT support (counters, sets, maps), bringing CRDTs to mainstream distributed databases"
      ],
      [
        "2017",
        "Figma, a collaborative design tool, publicly discusses using CRDTs for multiplayer editing, driving broader awareness in the software industry"
      ]
    ],
    "timeline_zh": [
      [
        "2006",
        "协作编辑器中使用的Logoot和其他操作转换（OT）方法早于正式CRDT理论"
      ],
      [
        "2011",
        "Shapiro等人发表里程碑式的「收敛和可交换复制数据类型综合研究」，正式化CRDT理论"
      ],
      [
        "2014",
        "Riak 2.0发布生产就绪的CRDT支持（计数器、集合、映射），将CRDT带入主流分布式数据库"
      ],
      [
        "2017",
        "协作设计工具Figma公开讨论使用CRDT进行多人编辑，推动软件行业更广泛的认知"
      ]
    ],
    "dos": [
      "Do choose the CRDT type that naturally matches the data semantics because forcing application data into a mismatched CRDT produces correct but semantically wrong results",
      "Do use delta-CRDTs (delta state CRDTs) for large state spaces because shipping full state on every sync generates excessive network bandwidth for large CRDT instances",
      "Do test convergence properties with property-based testing that generates arbitrary sequences of concurrent operations because CRDTs are easy to implement incorrectly in subtle ways",
      "Do consider using established CRDT libraries (Automerge, Yjs, redis-crdt) rather than building from scratch because the theoretical simplicity of CRDTs masks substantial implementation complexity"
    ],
    "dos_zh": [
      "选择与数据语义自然匹配的CRDT类型，因为将应用程序数据强制放入不匹配的CRDT会产生正确但语义错误的结果",
      "对大状态空间使用增量CRDT（增量状态CRDT），因为每次同步都传送完整状态会为大型CRDT实例产生过多网络带宽",
      "使用生成任意并发操作序列的基于属性的测试来测试收敛特性，因为CRDT很容易以微妙的方式实现不正确",
      "考虑使用成熟的CRDT库（Automerge、Yjs、redis-crdt）而非从头构建，因为CRDT的理论简单性掩盖了大量的实现复杂性"
    ],
    "donts": [
      "Don't use CRDTs for data that requires strict consistency (financial balances, inventory counts where overselling is unacceptable) because CRDTs guarantee eventual consistency, not strong consistency",
      "Don't model all application state as CRDTs because not every field needs conflict-free merging and the overhead is unnecessary for state that is never concurrently modified",
      "Don't confuse operational transformation (OT) with CRDTs because they solve the same collaboration problem with different guarantees — CRDTs are more composable but OT can produce better user intent preservation",
      "Don't ignore tombstone growth in OR-Sets and similar CRDTs because deleted elements leave tombstones that accumulate indefinitely without periodic garbage collection"
    ],
    "donts_zh": [
      "不要将CRDT用于需要强一致性的数据（财务余额、超卖不可接受的库存计数），因为CRDT保证最终一致性，而非强一致性",
      "不要将所有应用程序状态建模为CRDT，因为并非每个字段都需要无冲突合并，对于从未被并发修改的状态，开销是不必要的",
      "不要将操作转换（OT）与CRDT混淆，因为它们用不同的保证解决相同的协作问题——CRDT更可组合，但OT可以更好地保留用户意图",
      "不要忽视OR-Sets和类似CRDT中的墓碑增长，因为如果不定期进行垃圾回收，被删除元素留下的墓碑会无限期累积"
    ],
    "case_study_company": "Figma",
    "case_study": "Figma uses a custom CRDT-inspired data model for its multiplayer design editor, allowing dozens of designers to simultaneously edit the same design file without merge conflicts. Each design element (frame, shape, text) is represented as a node in a document tree, with properties stored as last-write-wins registers. Operations (move, resize, recolor) are sent as small deltas to a central relay server that broadcasts them to all connected clients. Figma's multiplayer system handles over 1 million concurrent collaborative sessions daily with sub-100ms operation propagation latency, making real-time collaboration indistinguishable from working locally.",
    "case_study_zh": "Figma为其多人设计编辑器使用自定义的CRDT启发数据模型，允许数十名设计师同时编辑同一设计文件而不产生合并冲突。每个设计元素（框架、形状、文本）表示为文档树中的节点，属性存储为最后写入胜出寄存器。操作（移动、调整大小、重新着色）作为小增量发送到中央中继服务器，该服务器将其广播给所有连接的客户端。Figma的多人系统每天处理超过100万个并发协作会话，操作传播延迟低于100毫秒，使实时协作与本地工作无异。",
    "when_not_to_use": [
      "Financial or inventory systems where preventing double-spending or overselling requires strong consistency guarantees that CRDTs cannot provide",
      "Simple leader-follower replication where a single primary handles all writes and conflict-free merging is unnecessary",
      "When the team lacks the mathematical background to correctly implement and test CRDT merge functions — incorrect implementations create subtle data loss bugs",
      "Systems with low concurrency where optimistic locking or simple last-write-wins semantics with human conflict resolution is sufficient"
    ],
    "when_not_to_use_zh": [
      "防止双重支出或超卖需要CRDT无法提供的强一致性保证的金融或库存系统",
      "单个主节点处理所有写入且无冲突合并不必要的简单主从复制",
      "当团队缺乏正确实现和测试CRDT合并函数的数学背景时——不正确的实现会产生微妙的数据丢失错误",
      "并发性低的系统，乐观锁定或带有人工冲突解决的简单最后写入胜出语义已足够"
    ],
    "adopters": [
      "Figma",
      "Notion",
      "Apple (CloudKit)",
      "Redis (CRDT for Redis Enterprise)",
      "Riak (Basho)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Shapiro, M., Preguiça, N., Baquero, C. & Zawirski, M. (2011). \"Conflict-Free Replicated Data Types\". Proceedings of SSS 2011, LNCS 6976. Springer.",
    "secondary_sources": [
      "Shapiro, M. et al. (2011). \"A Comprehensive Study of Convergent and Commutative Replicated Data Types\". INRIA Research Report RR-7506.",
      "Kleppmann, M. & Beresford, A.R. (2017). \"A Conflict-Free Replicated JSON Datatype\". IEEE Transactions on Parallel and Distributed Systems.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 5. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "eventual-consistency",
        "type": "extends"
      },
      {
        "slug": "gossip-protocol",
        "type": "complement"
      },
      {
        "slug": "consensus-protocols",
        "type": "alternative"
      },
      {
        "slug": "outbox-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 206,
    "name": "Outbox Pattern",
    "name_zh": "发件箱模式",
    "slug": "outbox-pattern",
    "category": "distributed",
    "desc": "Reliable event publishing from database transactions",
    "desc_zh": "从数据库事务中可靠地发布事件",
    "steps": [
      "Add an outbox table to the service's database schema alongside the primary business entities; the table stores pending domain events with columns for event type, payload, destination topic, and publication status",
      "Modify the service's transactional write path to insert domain events into the outbox table within the same database transaction as the business entity mutation, guaranteeing atomicity between state change and event recording",
      "Deploy a message relay process (Debezium CDC connector, polling relay, or Transactional Outbox relay) that reads unpublished events from the outbox table and publishes them to the message broker (Kafka, SQS, RabbitMQ)",
      "Mark events as published after successful broker acknowledgment; implement at-least-once delivery semantics since the relay may publish an event, crash before marking it, and republish on restart",
      "Consumers must be idempotent to handle duplicate events produced by the at-least-once relay; use deduplication keys or idempotent operations to ensure duplicate publications have no additional effect"
    ],
    "steps_zh": [
      "在服务的数据库模式中，在主要业务实体旁边添加发件箱表；该表存储待处理的领域事件，包含事件类型、负载、目标主题和发布状态的列",
      "修改服务的事务写入路径，在与业务实体变更相同的数据库事务中将领域事件插入发件箱表，保证状态变更和事件记录之间的原子性",
      "部署消息中继进程（Debezium CDC连接器、轮询中继或事务性发件箱中继），从发件箱表读取未发布的事件并将其发布到消息代理（Kafka、SQS、RabbitMQ）",
      "成功获得代理确认后将事件标记为已发布；实施至少一次交付语义，因为中继可能发布事件、在标记前崩溃，并在重启时重新发布",
      "消费者必须是幂等的以处理至少一次中继产生的重复事件；使用去重键或幂等操作确保重复发布没有额外影响"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Outbox Table",
      "Atomic Write",
      "Message Relay",
      "At-Least-Once",
      "Idempotent Consumer"
    ],
    "viz_labels_zh": [
      "发件箱表",
      "原子写入",
      "消息中继",
      "至少一次",
      "幂等消费"
    ],
    "related": [
      "change-data-capture",
      "saga-pattern",
      "idempotency-pattern",
      "crdt"
    ],
    "tags": [
      "outbox",
      "reliable-messaging",
      "transactional-outbox",
      "event-driven",
      "at-least-once"
    ],
    "origin_author": "Chris Richardson (microservices.io); popularized by Udi Dahan and Greg Young in CQRS/event sourcing contexts",
    "origin_source": "Richardson, C. (2018). \"Microservices Patterns\". Manning; microservices.io/patterns/data/transactional-outbox.html",
    "origin_source_zh": "Richardson, C.（2018）。《微服务模式》，Manning；microservices.io/patterns/data/transactional-outbox.html",
    "complexity": "intermediate",
    "when_to_use": [
      "When a microservice must atomically update its database and publish a domain event without using distributed transactions, preventing the dual-write problem",
      "When message broker publish failures (network partition, broker unavailability) must not cause data inconsistencies between the service database and the event log",
      "When event-driven architectures require guaranteed at-least-once event delivery even if the publishing service crashes immediately after committing a transaction",
      "When implementing the Saga pattern where each saga step must reliably publish the event that triggers the next step in the compensating transaction chain"
    ],
    "when_to_use_zh": [
      "当微服务必须在不使用分布式事务的情况下原子性地更新其数据库并发布领域事件，防止双写问题时",
      "当消息代理发布失败（网络分区、代理不可用）不得导致服务数据库和事件日志之间数据不一致时",
      "当事件驱动架构需要保证至少一次事件交付，即使发布服务在提交事务后立即崩溃时",
      "当实现Saga模式时，每个Saga步骤必须可靠地发布触发补偿事务链中下一步的事件"
    ],
    "core_concepts": [
      "Dual-write problem: the impossibility of atomically updating a database and publishing to a message broker in two separate operations — one will succeed and the other can fail, creating inconsistency",
      "Transactional outbox: a staging table within the same database that acts as a durable event queue, allowing a single database transaction to atomically commit both the business state and the event record",
      "Message relay: a background process that polls or tails the outbox table and publishes events to the broker, decoupling the business transaction from the publish operation",
      "At-least-once delivery: the guarantee that every event in the outbox will eventually be published to the broker, potentially more than once; consumers must handle duplicates"
    ],
    "core_concepts_zh": [
      "双写问题：在两个独立操作中原子性地更新数据库和发布到消息代理的不可能性——一个会成功而另一个可能失败，造成不一致",
      "事务性发件箱：同一数据库中作为持久事件队列的暂存表，允许单个数据库事务原子性地提交业务状态和事件记录",
      "消息中继：轮询或跟踪发件箱表并将事件发布到代理的后台进程，将业务事务与发布操作解耦",
      "至少一次交付：保证发件箱中的每个事件最终将发布到代理（可能不止一次）；消费者必须处理重复项"
    ],
    "timeline": [
      [
        "2010",
        "Udi Dahan and Greg Young discuss the dual-write problem and outbox as part of NServiceBus and CQRS/event sourcing patterns"
      ],
      [
        "2015",
        "Debezium project released by Red Hat, providing a CDC-based implementation of the outbox relay using database transaction logs"
      ],
      [
        "2018",
        "Chris Richardson formalizes the Transactional Outbox pattern in 'Microservices Patterns' and on microservices.io, giving it a canonical name and reference implementation"
      ],
      [
        "2020",
        "Debezium's Outbox Event Router SMT (Single Message Transform) provides a turnkey outbox implementation without polling, using log-based CDC"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "Udi Dahan和Greg Young在NServiceBus和CQRS/事件溯源模式的背景下讨论双写问题和发件箱"
      ],
      [
        "2015",
        "Red Hat发布Debezium项目，提供使用数据库事务日志的基于CDC的发件箱中继实现"
      ],
      [
        "2018",
        "Chris Richardson在《微服务模式》和microservices.io中正式化事务性发件箱模式，为其提供规范名称和参考实现"
      ],
      [
        "2020",
        "Debezium的发件箱事件路由器SMT（单消息转换）提供无需轮询的即用型发件箱实现，使用基于日志的CDC"
      ]
    ],
    "dos": [
      "Do use CDC-based outbox relay (Debezium) rather than polling where possible because CDC has lower database load and sub-second relay latency compared to polling intervals of seconds or minutes",
      "Do include a correlation ID in every outbox event so that distributed traces can be reconstructed across service boundaries from the originating request to the eventual event consumer",
      "Do partition the outbox table by service or aggregate type to prevent a high-volume event stream from a single aggregate from blocking events of other types",
      "Do set a maximum event age and alert when events remain unpublished beyond it because stuck events in the outbox indicate a relay failure that may cause business-critical delays"
    ],
    "dos_zh": [
      "尽可能使用基于CDC的发件箱中继（Debezium）而非轮询，因为与数秒或数分钟的轮询间隔相比，CDC具有更低的数据库负载和亚秒中继延迟",
      "在每个发件箱事件中包含关联ID，使分布式追踪可以跨服务边界从原始请求到最终事件消费者进行重建",
      "按服务或聚合类型对发件箱表进行分区，以防止单个聚合的大量事件流阻塞其他类型的事件",
      "设置最大事件年龄并在事件超出该时间未发布时发出告警，因为发件箱中的卡住事件表明可能导致业务关键延迟的中继失败"
    ],
    "donts": [
      "Don't use the outbox as a long-term event store because it is a short-lived staging area — events should be published within seconds and the outbox should be trimmed regularly",
      "Don't skip consumer idempotency because at-least-once delivery is a fundamental outbox guarantee and consumers that process duplicates non-idempotently will produce incorrect results",
      "Don't implement the outbox in a separate database from the business entity because the entire point of the pattern is to use a single database transaction to atomically write both",
      "Don't forget to monitor outbox table growth because an accumulating backlog indicates a relay outage or broker connectivity issue that requires immediate attention"
    ],
    "donts_zh": [
      "不要将发件箱用作长期事件存储，因为它是短期暂存区——事件应在几秒内发布，发件箱应定期清理",
      "不要跳过消费者幂等性，因为至少一次交付是发件箱的基本保证，非幂等处理重复项的消费者将产生不正确的结果",
      "不要在与业务实体不同的数据库中实现发件箱，因为该模式的整个意义在于使用单个数据库事务原子性地写入两者",
      "不要忘记监控发件箱表增长，因为积累的积压表明中继中断或代理连接问题，需要立即关注"
    ],
    "case_study_company": "Uber",
    "case_study": "Uber uses the Outbox Pattern across many of its microservices to ensure reliable event publishing without distributed transactions. In their trip dispatch service, when a driver accepts a trip, the service atomically updates the trip state in its PostgreSQL database and inserts a TripAccepted domain event into the outbox table in the same transaction. A Debezium CDC relay reads the outbox events from the PostgreSQL WAL and publishes them to Apache Kafka topics consumed by the rider notification service, pricing engine, and analytics pipeline. This design ensures that a TripAccepted event is always published at least once — even if the trip service crashes between committing the trip state and publishing the event — eliminating ghost trips and missed notifications.",
    "case_study_zh": "Uber在其许多微服务中使用发件箱模式，以确保在没有分布式事务的情况下可靠地发布事件。在其行程调度服务中，当司机接受行程时，服务在同一事务中原子性地更新其PostgreSQL数据库中的行程状态，并将TripAccepted领域事件插入发件箱表。Debezium CDC中继从PostgreSQL WAL读取发件箱事件，并将其发布到乘客通知服务、定价引擎和分析管道消费的Apache Kafka主题。这种设计确保TripAccepted事件始终至少被发布一次——即使行程服务在提交行程状态和发布事件之间崩溃——消除了幽灵行程和错过的通知。",
    "when_not_to_use": [
      "When the message broker and database can participate in an XA distributed transaction and the operational overhead of XA is acceptable",
      "Simple single-service applications without downstream event consumers where the overhead of maintaining an outbox table is unnecessary",
      "When event loss is acceptable (fire-and-forget telemetry, non-critical notifications) and the complexity of the outbox pattern is not justified by the reliability requirement",
      "Greenfield services where an event streaming architecture with event sourcing (Kafka as the system of record) eliminates the dual-write problem entirely at the architecture level"
    ],
    "when_not_to_use_zh": [
      "当消息代理和数据库可以参与XA分布式事务，且XA的运维开销可以接受时",
      "没有下游事件消费者的简单单服务应用程序，维护发件箱表的开销是不必要的",
      "当事件丢失可以接受（即发即忘遥测、非关键通知），且可靠性要求不足以证明发件箱模式的复杂性合理时",
      "采用事件溯源的事件流架构（Kafka作为记录系统）在架构级别完全消除双写问题的绿地服务"
    ],
    "adopters": [
      "Uber",
      "Netflix",
      "Shopify",
      "Red Hat (Debezium)",
      "Eventuate.io"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Richardson, C. (2018). \"Microservices Patterns\", Ch. 3. Manning Publications.",
    "secondary_sources": [
      "Richardson, C. (2018). \"Pattern: Transactional outbox\". microservices.io/patterns/data/transactional-outbox.html.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 11. O'Reilly Media.",
      "Debezium Project (2020). \"Outbox Event Router Transformation\". debezium.io/documentation."
    ],
    "typed_relations": [
      {
        "slug": "change-data-capture",
        "type": "complement"
      },
      {
        "slug": "saga-pattern",
        "type": "complement"
      },
      {
        "slug": "idempotency-pattern",
        "type": "complement"
      },
      {
        "slug": "crdt",
        "type": "related"
      }
    ]
  },
  {
    "id": 207,
    "name": "Service Discovery",
    "name_zh": "服务发现",
    "slug": "service-discovery",
    "category": "distributed",
    "desc": "Dynamic registration and lookup of service instances (Consul, etcd)",
    "desc_zh": "服务实例的动态注册与查找（Consul、etcd）",
    "steps": [
      "Choose a service registry (Consul, etcd, ZooKeeper, Eureka, or Kubernetes Service + CoreDNS) and deploy it as a highly available cluster with at least 3 nodes to survive node failures without losing the registry",
      "Implement service registration: when a service instance starts, it registers its network address (IP, port, protocol) and health check endpoint with the registry; on shutdown, it deregisters to remove stale entries",
      "Configure health checks in the registry so that the registry actively monitors instance health and automatically removes instances that fail checks, preventing traffic from routing to unhealthy endpoints",
      "Implement service lookup in client services using either client-side discovery (the client queries the registry and load-balances across healthy instances) or server-side discovery (the client sends requests to a load balancer or service mesh proxy that performs registry lookup)",
      "Test failure scenarios: kill a service instance and verify that the registry removes it within the health check timeout and that clients stop routing traffic to the dead instance within the acceptable failover window"
    ],
    "steps_zh": [
      "选择服务注册表（Consul、etcd、ZooKeeper、Eureka或Kubernetes Service + CoreDNS）并将其部署为至少3个节点的高可用集群，以在节点故障后存活而不丢失注册表",
      "实施服务注册：当服务实例启动时，向注册表注册其网络地址（IP、端口、协议）和健康检查端点；关闭时注销以删除过期条目",
      "在注册表中配置健康检查，使注册表主动监控实例健康状况，并自动删除未通过检查的实例，防止流量路由到不健康的端点",
      "在客户端服务中使用客户端发现（客户端查询注册表并在健康实例间负载均衡）或服务器端发现（客户端向执行注册表查找的负载均衡器或服务网格代理发送请求）来实施服务查找",
      "测试故障场景：终止服务实例并验证注册表在健康检查超时内删除它，以及客户端在可接受的故障转移窗口内停止路由流量到死亡实例"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Service Registry",
      "Registration",
      "Health Checks",
      "Service Lookup",
      "Failover Test"
    ],
    "viz_labels_zh": [
      "服务注册表",
      "服务注册",
      "健康检查",
      "服务查找",
      "故障切换"
    ],
    "related": [
      "consensus-protocols",
      "leader-election",
      "circuit-breaker-with-retry",
      "bulkhead-service-level"
    ],
    "tags": [
      "service-discovery",
      "consul",
      "etcd",
      "kubernetes",
      "load-balancing"
    ],
    "origin_author": "Netflix (Eureka, 2012); HashiCorp (Consul, 2014); Kubernetes (CoreDNS-based, 2015)",
    "origin_source": "Richardson, C. (2018). \"Microservices Patterns\", Ch. 3. Manning; consul.io documentation; Kubernetes Service documentation",
    "origin_source_zh": "Richardson, C.（2018）。《微服务模式》第3章，Manning；consul.io文档；Kubernetes Service文档",
    "complexity": "intermediate",
    "when_to_use": [
      "When microservices are deployed on dynamic infrastructure where IP addresses change on every deployment, container restart, or auto-scaling event",
      "When a service mesh is not in use and services must directly discover the network locations of their dependencies without hard-coded configuration",
      "When the number of service instances varies dynamically based on load and clients must route only to currently healthy instances",
      "When multi-datacenter or multi-cloud deployments require services to discover peers across network segments with different address spaces"
    ],
    "when_to_use_zh": [
      "当微服务部署在每次部署、容器重启或自动扩展事件都会更改IP地址的动态基础设施上时",
      "当没有使用服务网格，服务必须直接发现其依赖项的网络位置而不使用硬编码配置时",
      "当服务实例数量根据负载动态变化，客户端必须仅路由到当前健康的实例时",
      "当多数据中心或多云部署要求服务在具有不同地址空间的网络段之间发现对等节点时"
    ],
    "core_concepts": [
      "Service registry: the centralized (or distributed) store that maintains the mapping from service name to healthy instance network addresses",
      "Client-side discovery: the calling service queries the registry directly, selects an instance using a load-balancing algorithm (round-robin, least-connections), and makes the request directly to the chosen instance",
      "Server-side discovery: the calling service sends requests to a stable proxy (load balancer, API gateway, or service mesh sidecar) that performs registry lookup and routing, hiding infrastructure dynamism from the application",
      "Health check: a mechanism by which the registry (or a sidecar) periodically probes service instances (HTTP endpoint, TCP ping, or script) and removes those that fail"
    ],
    "core_concepts_zh": [
      "服务注册表：维护服务名称到健康实例网络地址映射的集中（或分布式）存储",
      "客户端发现：调用服务直接查询注册表，使用负载均衡算法（轮询、最少连接）选择实例，并直接向所选实例发送请求",
      "服务器端发现：调用服务向稳定代理（负载均衡器、API网关或服务网格Sidecar）发送请求，该代理执行注册表查找和路由，向应用程序隐藏基础设施动态性",
      "健康检查：注册表（或Sidecar）定期探测服务实例（HTTP端点、TCP ping或脚本）并删除失败实例的机制"
    ],
    "timeline": [
      [
        "2012",
        "Netflix open-sources Eureka, one of the first widely-used service registries built for cloud-native microservices at scale"
      ],
      [
        "2014",
        "HashiCorp releases Consul with built-in service discovery, health checking, key-value store, and multi-datacenter support"
      ],
      [
        "2015",
        "Kubernetes Services and CoreDNS provide DNS-based service discovery as a first-class primitive in the container orchestration platform"
      ],
      [
        "2018",
        "Service meshes (Istio, Linkerd) supersede explicit service discovery for many use cases by embedding discovery in the sidecar proxy"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Netflix开源Eureka，这是最早为大规模云原生微服务构建的广泛使用的服务注册表之一"
      ],
      [
        "2014",
        "HashiCorp发布Consul，内置服务发现、健康检查、键值存储和多数据中心支持"
      ],
      [
        "2015",
        "Kubernetes Services和CoreDNS在容器编排平台中提供基于DNS的服务发现作为一等原语"
      ],
      [
        "2018",
        "服务网格（Istio、Linkerd）通过将发现嵌入Sidecar代理，在许多用例中取代了显式服务发现"
      ]
    ],
    "dos": [
      "Do deploy the service registry as a clustered, highly available service with at least 3 nodes because the registry is a critical infrastructure dependency that every service relies on",
      "Do implement graceful deregistration in shutdown hooks so that instances remove themselves from the registry before stopping, minimizing the window where the registry routes to a dead instance",
      "Do tune health check intervals and failure thresholds to balance detection speed against false-positive deregistrations because overly sensitive health checks deregister healthy instances under load",
      "Do cache registry lookups with a short TTL in client services because querying the registry on every request creates a bottleneck at the registry and adds latency to every service call"
    ],
    "dos_zh": [
      "将服务注册表部署为至少3个节点的集群高可用服务，因为注册表是每个服务都依赖的关键基础设施依赖项",
      "在关闭钩子中实施优雅注销，使实例在停止前从注册表中删除自身，最小化注册表路由到死亡实例的窗口",
      "调整健康检查间隔和失败阈值以平衡检测速度与误报注销，因为过于敏感的健康检查会在负载下注销健康实例",
      "在客户端服务中以短TTL缓存注册表查找，因为每次请求都查询注册表会在注册表处产生瓶颈，并为每次服务调用增加延迟"
    ],
    "donts": [
      "Don't hard-code service instance addresses in configuration files or environment variables because they change with every deployment and become stale immediately",
      "Don't skip health checks on registered services because the registry will route traffic to instances that are up but serving errors if health checks are absent",
      "Don't make the service registry a single point of failure by running it as a single instance — a registry outage means no service can discover any other service",
      "Don't use service discovery for databases or stateful services that require sticky connections because round-robin instance selection breaks connection pooling and stateful session affinity"
    ],
    "donts_zh": [
      "不要在配置文件或环境变量中硬编码服务实例地址，因为它们在每次部署时都会更改，立即变得过时",
      "不要跳过已注册服务的健康检查，因为如果没有健康检查，注册表将向运行中但提供错误服务的实例路由流量",
      "不要通过将服务注册表作为单实例运行使其成为单点故障——注册表中断意味着没有服务可以发现任何其他服务",
      "不要将服务发现用于需要粘性连接的数据库或有状态服务，因为轮询实例选择会破坏连接池和有状态会话亲和性"
    ],
    "case_study_company": "Airbnb",
    "case_study": "Airbnb runs a service discovery infrastructure built on Consul, managing over 5,000 service instances across multiple AWS availability zones. When an Airbnb service deploys a new version, the old instances deregister and new instances register with Consul automatically via their deployment system. Consul's health checks probe each instance's /health endpoint every 10 seconds, removing failed instances within 30 seconds. Client services use a local Consul agent for cached lookups, making service resolution sub-millisecond. During Airbnb's 2020 AWS availability zone failures, Consul's multi-datacenter federation automatically shifted traffic to healthy availability zones with under 2 minutes of detection-to-failover latency.",
    "case_study_zh": "Airbnb运行基于Consul构建的服务发现基础设施，管理跨多个AWS可用区的5,000多个服务实例。当Airbnb服务部署新版本时，旧实例注销，新实例通过其部署系统自动向Consul注册。Consul的健康检查每10秒探测每个实例的/health端点，在30秒内删除失败的实例。客户端服务使用本地Consul代理进行缓存查找，使服务解析达到亚毫秒级。在Airbnb 2020年AWS可用区故障期间，Consul的多数据中心联合自动将流量转移到健康的可用区，检测到故障转移的延迟不到2分钟。",
    "when_not_to_use": [
      "Monolithic applications where all components run in the same process or on a fixed set of known servers with static IP addresses",
      "Small deployments with a fixed number of service instances where a static load balancer configuration is sufficient and the overhead of a service registry is not justified",
      "When a service mesh (Istio, Linkerd) is already in use because the mesh sidecar handles service discovery transparently without a separate registry client in the application",
      "Serverless or FaaS architectures where the platform manages instance routing and developers do not need to implement service discovery"
    ],
    "when_not_to_use_zh": [
      "所有组件在同一进程中运行或在具有静态IP地址的固定已知服务器集上运行的单体应用程序",
      "具有固定服务实例数量的小型部署，静态负载均衡器配置已足够且服务注册表的开销不合理",
      "当服务网格（Istio、Linkerd）已在使用时，因为网格Sidecar透明地处理服务发现，应用程序中不需要单独的注册表客户端",
      "平台管理实例路由且开发者不需要实现服务发现的无服务器或FaaS架构"
    ],
    "adopters": [
      "Airbnb",
      "Lyft",
      "Shopify",
      "HashiCorp customers",
      "Kubernetes ecosystem"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Richardson, C. (2018). \"Microservices Patterns\", Ch. 3. Manning Publications.",
    "secondary_sources": [
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 10. O'Reilly Media.",
      "HashiCorp (2014). \"Consul Documentation\". consul.io.",
      "Kubernetes Project (2015). \"Kubernetes Services Documentation\". kubernetes.io/docs/concepts/services-networking/service."
    ],
    "typed_relations": [
      {
        "slug": "consensus-protocols",
        "type": "complement"
      },
      {
        "slug": "leader-election",
        "type": "complement"
      },
      {
        "slug": "circuit-breaker-with-retry",
        "type": "complement"
      },
      {
        "slug": "bulkhead-service-level",
        "type": "complement"
      }
    ]
  },
  {
    "id": 208,
    "name": "Circuit Breaker with Retry",
    "name_zh": "带重试的断路器",
    "slug": "circuit-breaker-with-retry",
    "category": "distributed",
    "desc": "Combined retry + circuit breaker for resilient communication",
    "desc_zh": "结合重试与断路器实现弹性通信",
    "steps": [
      "Instrument all inter-service calls with a retry policy: configure a maximum retry count (e.g., 3 attempts), a retry delay strategy (fixed, exponential backoff, or jitter), and the specific exception types that should trigger a retry versus those that should fail immediately",
      "Wrap the retry-capable call with a circuit breaker: configure a failure threshold (e.g., 50% failure rate over 30 seconds) that trips the breaker to open state, a timeout for the half-open probe attempt, and the success count required to close the breaker",
      "Ensure the circuit breaker wraps the entire retry sequence, not each individual attempt: the breaker tracks failures of the final result after all retries are exhausted, not intermediate retry failures",
      "Implement a fallback strategy for open circuit state: return a cached value, a degraded default response, or propagate a clear error to the caller indicating the dependency is unavailable",
      "Monitor the circuit breaker state, failure rate, and retry exhaustion metrics in a dashboard; alert when a circuit enters open state for more than a configurable threshold because open circuits indicate dependency health problems requiring investigation"
    ],
    "steps_zh": [
      "用重试策略对所有服务间调用进行埋点：配置最大重试次数（如3次尝试）、重试延迟策略（固定、指数退避或抖动）以及应触发重试的特定异常类型与应立即失败的类型",
      "用断路器包装具有重试能力的调用：配置使断路器跳闸为打开状态的失败阈值（如30秒内50%失败率）、半开探测尝试的超时和关闭断路器所需的成功次数",
      "确保断路器包装整个重试序列，而非每次单独尝试：断路器在所有重试耗尽后追踪最终结果的失败，而非中间重试失败",
      "为断路器打开状态实施回退策略：返回缓存值、降级默认响应，或向调用方传播明确的错误，指示依赖项不可用",
      "在仪表板中监控断路器状态、失败率和重试耗尽指标；当断路器进入打开状态超过可配置阈值时发出告警，因为打开的断路器表明需要调查的依赖项健康问题"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Retry Policy",
      "Circuit Breaker",
      "Wrap Retry",
      "Fallback",
      "Monitor"
    ],
    "viz_labels_zh": [
      "重试策略",
      "断路器",
      "包装重试",
      "降级策略",
      "状态监控"
    ],
    "related": [
      "bulkhead-service-level",
      "service-discovery",
      "idempotency-pattern"
    ],
    "tags": [
      "circuit-breaker",
      "retry",
      "resilience",
      "fault-tolerance",
      "hystrix"
    ],
    "origin_author": "Michael Nygard (circuit breaker pattern, 2007); Polly (.NET) and Hystrix (Netflix, 2012) as implementations",
    "origin_source": "Nygard, M.T. (2007/2018). \"Release It!\", 2nd ed. Pragmatic Bookshelf; Netflix Hystrix documentation (2012); Fowler, M. (2014). \"CircuitBreaker\". martinfowler.com",
    "origin_source_zh": "Nygard, M.T.（2007/2018）。《Release It!》第2版，Pragmatic Bookshelf；Netflix Hystrix文档（2012）；Fowler, M.（2014）。「断路器」，martinfowler.com",
    "complexity": "intermediate",
    "when_to_use": [
      "When a microservice makes synchronous HTTP or gRPC calls to downstream dependencies that may be temporarily unavailable, slow, or returning errors intermittently",
      "When retry storms from many callers simultaneously retrying a failing service would overwhelm the dependency rather than allowing it to recover",
      "When graceful degradation is required so that a failed dependency causes the calling service to return a degraded response rather than failing entirely",
      "When a critical downstream service is known to experience periodic maintenance windows or traffic-related slowdowns that should not cascade into the calling service"
    ],
    "when_to_use_zh": [
      "当微服务对可能暂时不可用、缓慢或间歇性返回错误的下游依赖项进行同步HTTP或gRPC调用时",
      "当来自许多调用方同时重试失败服务的重试风暴会压垮依赖项而不允许其恢复时",
      "当需要优雅降级使失败的依赖项导致调用服务返回降级响应而非完全失败时",
      "当已知关键下游服务会经历定期维护窗口或与流量相关的减速，不应级联到调用服务时"
    ],
    "core_concepts": [
      "Closed state: the circuit is functioning normally; requests flow through; failures are counted against the threshold",
      "Open state: the failure threshold has been breached; all requests fail immediately without attempting the downstream call, preventing further load on an already-struggling dependency",
      "Half-open state: after a configured timeout, the circuit allows a single probe request through; if it succeeds the circuit closes; if it fails the circuit reopens for another timeout period",
      "Exponential backoff with jitter: retry delays that grow exponentially with each attempt and add random jitter to prevent thundering herd — many callers retrying simultaneously at the same interval"
    ],
    "core_concepts_zh": [
      "关闭状态：断路器正常运行；请求通过；失败被计入阈值",
      "打开状态：失败阈值已被突破；所有请求立即失败而不尝试下游调用，防止对已不堪重负的依赖项施加更多负载",
      "半开状态：在配置的超时后，断路器允许单个探测请求通过；如果成功则断路器关闭；如果失败则断路器重新打开另一个超时周期",
      "带抖动的指数退避：重试延迟随每次尝试呈指数增长，并加入随机抖动以防止惊群效应——即许多调用方以相同间隔同时重试"
    ],
    "timeline": [
      [
        "2007",
        "Michael Nygard introduces the circuit breaker pattern in 'Release It!' as a stability pattern for preventing cascading failures"
      ],
      [
        "2012",
        "Netflix open-sources Hystrix, the first widely-adopted circuit breaker library for JVM services in production-scale microservices"
      ],
      [
        "2016",
        "Polly (.NET) and Resilience4j (Java) emerge as framework-agnostic circuit breaker libraries; Hystrix enters maintenance mode"
      ],
      [
        "2020",
        "Service meshes (Istio, Linkerd) implement circuit breaking at the infrastructure layer via sidecar proxies, making it available without code changes"
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "Michael Nygard在《Release It!》中引入断路器模式，作为防止级联故障的稳定性模式"
      ],
      [
        "2012",
        "Netflix开源Hystrix，这是第一个在生产规模微服务中广泛采用的JVM服务断路器库"
      ],
      [
        "2016",
        "Polly（.NET）和Resilience4j（Java）作为框架无关的断路器库出现；Hystrix进入维护模式"
      ],
      [
        "2020",
        "服务网格（Istio、Linkerd）通过Sidecar代理在基础设施层实现断路，使其无需代码更改即可使用"
      ]
    ],
    "dos": [
      "Do configure separate circuit breakers for each downstream dependency because a shared breaker allows one failing dependency to block calls to all other healthy dependencies",
      "Do use jitter in retry delays because without jitter all retrying clients synchronize their retry timing and create a thundering herd that can overwhelm a recovering service",
      "Do set the circuit breaker timeout based on the dependency's expected recovery time rather than an arbitrary default because too-short timeouts cause premature circuit closure before the dependency has recovered",
      "Do test the circuit breaker behavior under load with chaos engineering tools (Chaos Monkey, Gremlin) because untested circuit breakers fail in unexpected ways under production conditions"
    ],
    "dos_zh": [
      "为每个下游依赖项配置单独的断路器，因为共享断路器允许一个失败的依赖项阻塞对所有其他健康依赖项的调用",
      "在重试延迟中使用抖动，因为没有抖动，所有重试客户端会同步其重试时间，形成可能压垮恢复中服务的惊群效应",
      "根据依赖项的预期恢复时间而非任意默认值设置断路器超时，因为过短的超时会导致在依赖项恢复前过早关闭断路器",
      "使用混沌工程工具（Chaos Monkey、Gremlin）在负载下测试断路器行为，因为未经测试的断路器在生产条件下会以意想不到的方式失败"
    ],
    "donts": [
      "Don't retry non-idempotent operations (POST, DELETE with side effects) without idempotency keys because each retry may duplicate the business action",
      "Don't set retry counts too high because aggressive retries amplify load on a struggling service and delay the circuit breaker from tripping when a dependency is genuinely down",
      "Don't share a circuit breaker thread pool across all callers because a slow dependency that exhausts the pool will block all other requests in the service, regardless of their target",
      "Don't disable circuit breakers in production to 'fix' a broken integration because the circuit breaker is revealing a dependency problem — disabling it causes cascading failure instead of graceful degradation"
    ],
    "donts_zh": [
      "不要在没有幂等键的情况下重试非幂等操作（带有副作用的POST、DELETE），因为每次重试都可能重复业务操作",
      "不要将重试次数设置得太高，因为激进的重试会放大对困难服务的负载，并延迟断路器在依赖项真正宕机时跳闸",
      "不要在所有调用方之间共享断路器线程池，因为耗尽池的慢依赖项将阻塞服务中的所有其他请求，无论其目标是什么",
      "不要在生产中禁用断路器来「修复」损坏的集成，因为断路器揭示的是依赖项问题——禁用它会导致级联故障而非优雅降级"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's Hystrix library powered circuit breaking across hundreds of microservices in their video streaming platform. The most famous application is their API gateway, which calls dozens of downstream services (recommendations, user profiles, billing, content catalog) to assemble the home screen. Each dependency is wrapped in its own Hystrix command with a dedicated thread pool. When the recommendations service degrades during peak traffic, Hystrix trips its circuit and the API gateway falls back to a curated editorial list instead of crashing the entire home screen. Netflix reported that during their 2012 Christmas peak, over 3.5 billion Hystrix fallback calls were executed, keeping the service available for millions of users while individual dependencies experienced failures.",
    "case_study_zh": "Netflix的Hystrix库为其视频流平台数百个微服务提供断路支持。最著名的应用是其API网关，它调用数十个下游服务（推荐、用户资料、计费、内容目录）来组装主屏幕。每个依赖项都包装在自己的Hystrix命令中，带有专用线程池。当推荐服务在峰值流量期间降级时，Hystrix跳闸其断路器，API网关回退到精心策划的编辑列表，而不是使整个主屏幕崩溃。Netflix报告称，在2012年圣诞节峰值期间，执行了超过35亿次Hystrix回退调用，在各个依赖项遭受故障时使数百万用户的服务保持可用。",
    "when_not_to_use": [
      "Internal in-process method calls that do not cross network boundaries because the overhead of a circuit breaker is unnecessary for calls that cannot fail due to network issues",
      "Asynchronous message-based communication where at-least-once delivery and consumer retries already handle transient failures at the messaging layer",
      "Database connections within a single service where connection pooling and timeout configuration provide sufficient resilience without circuit breaker complexity",
      "When the downstream service is the organization's own service with very high availability SLAs — invest in improving the service's reliability rather than masking its failures with a circuit breaker"
    ],
    "when_not_to_use_zh": [
      "不跨越网络边界的进程内方法调用，因为对于不会因网络问题而失败的调用，断路器的开销是不必要的",
      "至少一次交付和消费者重试已经在消息层处理瞬态故障的异步基于消息的通信",
      "单个服务内的数据库连接，连接池和超时配置提供足够的弹性而不需要断路器复杂性",
      "当下游服务是具有非常高可用性SLA的组织自己的服务时——投资提高服务的可靠性，而不是用断路器掩盖其故障"
    ],
    "adopters": [
      "Netflix",
      "Amazon",
      "Alibaba (Sentinel)",
      "Microsoft (Polly/.NET)",
      "Spring Cloud (Resilience4j)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "performance"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Nygard, M.T. (2018). \"Release It!\", 2nd ed., Ch. 5: Stability Patterns. Pragmatic Bookshelf.",
    "secondary_sources": [
      "Fowler, M. (2014). \"CircuitBreaker\". martinfowler.com/bliki/CircuitBreaker.html.",
      "Netflix (2012). \"Hystrix: Latency and Fault Tolerance for Distributed Systems\". github.com/Netflix/Hystrix.",
      "Resilience4j Project (2019). \"Resilience4j Documentation\". resilience4j.readme.io."
    ],
    "typed_relations": [
      {
        "slug": "bulkhead-service-level",
        "type": "complement"
      },
      {
        "slug": "service-discovery",
        "type": "complement"
      },
      {
        "slug": "idempotency-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 209,
    "name": "Bulkhead at Service Level",
    "name_zh": "服务级隔离舱",
    "slug": "bulkhead-service-level",
    "category": "distributed",
    "desc": "Isolating service resources to prevent cascading failures (different from code-level bulkhead in quality.json)",
    "desc_zh": "隔离服务资源以防止级联故障（区别于quality.json中代码级别的隔离舱）",
    "steps": [
      "Identify the critical resource pools in the service that can be exhausted by a single misbehaving dependency or tenant: thread pools, connection pools, memory limits, and rate-limited API quotas",
      "Partition resource pools by consumer or dependency: assign dedicated thread pools to each downstream dependency call (Hystrix semaphore/thread isolation), or partition connection pools by calling service identity",
      "Set capacity limits on each bulkhead partition: define the maximum concurrent requests, maximum queue depth, and timeout for each pool so that no single consumer can exceed its allocation even if it sends unlimited requests",
      "Implement tenant-level bulkheads for multi-tenant services: allocate separate rate limits, request queues, and processing resources per tenant so that a high-volume tenant cannot starve resources for other tenants",
      "Monitor bulkhead utilization, queue depth, and rejection rates per partition in observability tooling; tune partition sizes based on observed traffic patterns and ensure that the sum of all partition limits does not exceed total resource capacity"
    ],
    "steps_zh": [
      "识别服务中可能被单个行为异常的依赖项或租户耗尽的关键资源池：线程池、连接池、内存限制和受速率限制的API配额",
      "按消费者或依赖项划分资源池：为每个下游依赖项调用分配专用线程池（Hystrix信号量/线程隔离），或按调用服务身份划分连接池",
      "设置每个隔离舱分区的容量限制：为每个池定义最大并发请求数、最大队列深度和超时，使任何单个消费者即使发送无限请求也无法超过其分配",
      "为多租户服务实施租户级隔离舱：按租户分配单独的速率限制、请求队列和处理资源，使高流量租户无法使其他租户的资源饥饿",
      "在可观测性工具中监控每个分区的隔离舱利用率、队列深度和拒绝率；根据观察到的流量模式调整分区大小，并确保所有分区限制之和不超过总资源容量"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Resource Pools",
      "Partition Pools",
      "Capacity Limits",
      "Tenant Bulkhead",
      "Monitor"
    ],
    "viz_labels_zh": [
      "资源池",
      "分区隔离",
      "容量限制",
      "租户隔离",
      "使用监控"
    ],
    "related": [
      "circuit-breaker-with-retry",
      "service-discovery",
      "consistent-hashing"
    ],
    "tags": [
      "bulkhead",
      "isolation",
      "resilience",
      "multi-tenant",
      "resource-partitioning"
    ],
    "origin_author": "Michael Nygard (2007); popularized by Netflix Hystrix thread pool isolation (2012)",
    "origin_source": "Nygard, M.T. (2018). \"Release It!\", 2nd ed., Ch. 5. Pragmatic Bookshelf; Netflix Hystrix Bulkhead documentation",
    "origin_source_zh": "Nygard, M.T.（2018）。《Release It!》第2版第5章，Pragmatic Bookshelf；Netflix Hystrix隔离舱文档",
    "complexity": "intermediate",
    "when_to_use": [
      "When a single slow or failing downstream dependency can exhaust the service's shared thread pool and make all other API endpoints unresponsive",
      "When a multi-tenant service must guarantee fair resource allocation so that a noisy tenant does not starve resources for others",
      "When different API endpoints have vastly different latency profiles and fast, latency-sensitive endpoints share a thread pool with slow batch endpoints",
      "When a service has multiple integration points with third-party APIs that have independent failure modes and rate limits"
    ],
    "when_to_use_zh": [
      "当单个缓慢或失败的下游依赖项可能耗尽服务的共享线程池，使所有其他API端点无响应时",
      "当多租户服务必须保证公平的资源分配，使嘈杂的租户不会使其他租户的资源饥饿时",
      "当不同的API端点具有截然不同的延迟特性，且快速的延迟敏感端点与慢速批处理端点共享线程池时",
      "当服务具有多个与具有独立故障模式和速率限制的第三方API的集成点时"
    ],
    "core_concepts": [
      "Resource partition: a dedicated, bounded allocation of a resource type (threads, connections, memory) assigned to a specific consumer, dependency, or tenant, preventing cross-partition interference",
      "Thread pool isolation: assigning a dedicated thread pool to each downstream dependency call; if the dependency blocks, only its pool is exhausted, not the service's main execution pool",
      "Semaphore isolation: limiting the number of concurrent in-flight requests to a dependency using a semaphore rather than a separate thread pool; lower overhead than thread isolation but no timeout enforcement",
      "Shed load: when a bulkhead's queue reaches capacity, new requests are immediately rejected with an explicit error (503 or rate limit response) rather than queuing indefinitely and causing latency to grow unbounded"
    ],
    "core_concepts_zh": [
      "资源分区：分配给特定消费者、依赖项或租户的专用、有界资源类型（线程、连接、内存）分配，防止跨分区干扰",
      "线程池隔离：为每个下游依赖项调用分配专用线程池；如果依赖项阻塞，只有其池被耗尽，而不是服务的主执行池",
      "信号量隔离：使用信号量而非单独的线程池限制对依赖项的并发进行中请求数量；比线程隔离开销更低，但无法强制超时",
      "减负：当隔离舱的队列达到容量时，新请求立即以明确的错误（503或速率限制响应）被拒绝，而非无限排队导致延迟无限增长"
    ],
    "timeline": [
      [
        "2007",
        "Michael Nygard introduces the Bulkhead pattern in 'Release It!', drawing the analogy to ship hull compartmentalization for software system isolation"
      ],
      [
        "2012",
        "Netflix Hystrix implements thread pool isolation as its primary bulkhead mechanism, making the pattern production-proven at massive scale"
      ],
      [
        "2017",
        "Kubernetes resource limits (CPU/memory per pod) and namespaces provide infrastructure-level bulkheads for multi-tenant clusters"
      ],
      [
        "2020",
        "Service meshes (Istio) implement connection pool settings and concurrency limits at the infrastructure layer, providing bulkhead behavior without application code changes"
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "Michael Nygard在《Release It!》中引入隔离舱模式，将船体舱室隔离的类比用于软件系统隔离"
      ],
      [
        "2012",
        "Netflix Hystrix将线程池隔离作为其主要隔离舱机制，使该模式在大规模生产中得到验证"
      ],
      [
        "2017",
        "Kubernetes资源限制（每个Pod的CPU/内存）和命名空间为多租户集群提供基础设施级隔离舱"
      ],
      [
        "2020",
        "服务网格（Istio）在基础设施层实施连接池设置和并发限制，在不更改应用程序代码的情况下提供隔离舱行为"
      ]
    ],
    "dos": [
      "Do size bulkhead partitions based on measured traffic data rather than intuition because over-partitioning wastes resources while under-partitioning creates artificial bottlenecks",
      "Do return meaningful rejection errors (503 Service Unavailable with Retry-After) when bulkhead capacity is exceeded so clients can implement appropriate backoff",
      "Do combine bulkheads with circuit breakers because bulkheads limit the blast radius of a failure while circuit breakers prevent repeated attempts against a known-failed dependency",
      "Do test bulkhead behavior with load tests that simulate tenant isolation scenarios and third-party API slowdowns to verify that partitions behave correctly under stress"
    ],
    "dos_zh": [
      "根据测量的流量数据而非直觉调整隔离舱分区大小，因为过度分区浪费资源，而分区不足会造成人为瓶颈",
      "当隔离舱容量超出时返回有意义的拒绝错误（503服务不可用 + Retry-After），以便客户端可以实施适当的退避",
      "将隔离舱与断路器结合，因为隔离舱限制故障的爆炸半径，而断路器防止对已知失败依赖项的重复尝试",
      "使用模拟租户隔离场景和第三方API减速的负载测试来测试隔离舱行为，以验证分区在压力下正确运行"
    ],
    "donts": [
      "Don't create too many small partitions because context switching between many small thread pools introduces overhead that can degrade performance more than the isolation benefit provides",
      "Don't set partition limits so large that they effectively share resources — a bulkhead that allows 95% of total threads defeats the isolation purpose",
      "Don't implement bulkheads only at the thread pool level because network connection pools, database connection pools, and outbound API rate limits also need partitioning for true isolation",
      "Don't confuse bulkhead rejection (immediate fail-fast) with throttling (delayed processing) because bulkhead rejections are intentional load shedding, not queuing — clients must retry with backoff"
    ],
    "donts_zh": [
      "不要创建太多小分区，因为在许多小线程池之间切换上下文会引入开销，其造成的性能损失可能超过隔离带来的收益",
      "不要将分区限制设置得太大以至于实际上等同于共享资源——允许占用95%总线程的隔离舱失去了隔离的意义",
      "不要仅在线程池级别实施隔离舱，因为网络连接池、数据库连接池和出站API速率限制也需要分区才能实现真正隔离",
      "不要将隔离舱拒绝（立即快速失败）与节流（延迟处理）混淆，因为隔离舱拒绝是有意的负载减少，而非排队——客户端必须使用退避重试"
    ],
    "case_study_company": "Amazon",
    "case_study": "Amazon's retail website uses bulkhead isolation to prevent slow third-party integrations from degrading the core shopping experience. The product detail page aggregates data from dozens of internal and external services (reviews, inventory, pricing, recommendations, seller data). Each data source is assigned a dedicated connection pool and thread pool with a strict timeout. When a third-party seller data API experiences latency spikes during peak events like Prime Day, its bulkhead is exhausted and the page falls back to cached seller information, leaving all other page components (price, add-to-cart, reviews) fully functional. Amazon reports that bulkhead isolation prevents over 99% of third-party API degradations from impacting their core conversion metrics.",
    "case_study_zh": "亚马逊零售网站使用隔离舱隔离防止慢速第三方集成降低核心购物体验。产品详情页面汇总来自数十个内部和外部服务（评论、库存、定价、推荐、卖家数据）的数据。每个数据源都分配了带有严格超时的专用连接池和线程池。当第三方卖家数据API在Prime Day等峰值事件期间遇到延迟尖峰时，其隔离舱被耗尽，页面回退到缓存的卖家信息，使所有其他页面组件（价格、加购、评论）完全正常运行。亚马逊报告称，隔离舱隔离防止了99%以上的第三方API降级影响其核心转化指标。",
    "when_not_to_use": [
      "Single-threaded or event-loop based services (Node.js, asyncio) where thread pool isolation is not applicable — use semaphore-based concurrency limits and async timeout instead",
      "Monolithic applications where all components share the same process and true resource isolation requires OS-level process or container boundaries",
      "When all downstream dependencies have identical performance characteristics and there is no differentiation needed between fast and slow paths",
      "When the service only calls a single downstream dependency and there is no multi-tenancy to isolate — the complexity of bulkhead partitioning is not justified"
    ],
    "when_not_to_use_zh": [
      "基于单线程或事件循环的服务（Node.js、asyncio），线程池隔离不适用——改用基于信号量的并发限制和异步超时",
      "所有组件共享同一进程且真正的资源隔离需要操作系统级进程或容器边界的单体应用程序",
      "当所有下游依赖项具有相同的性能特征，快速路径和慢速路径之间不需要差异化时",
      "当服务只调用单个下游依赖项且没有需要隔离的多租户时——隔离舱分区的复杂性不合理"
    ],
    "adopters": [
      "Amazon",
      "Netflix",
      "Microsoft (Azure Service Fabric)",
      "Uber",
      "Istio service mesh users"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Nygard, M.T. (2018). \"Release It!\", 2nd ed., Ch. 5: Stability Patterns — Bulkheads. Pragmatic Bookshelf.",
    "secondary_sources": [
      "Netflix (2012). \"Hystrix: How it Works — Thread Pool Isolation\". github.com/Netflix/Hystrix/wiki.",
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 12. O'Reilly Media.",
      "Richardson, C. (2018). \"Microservices Patterns\", Ch. 3. Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "circuit-breaker-with-retry",
        "type": "complement"
      },
      {
        "slug": "service-discovery",
        "type": "complement"
      },
      {
        "slug": "consistent-hashing",
        "type": "related"
      }
    ]
  },
  {
    "id": 264,
    "name": "Raft Consensus Algorithm",
    "name_zh": "Raft 共识算法",
    "slug": "raft-consensus",
    "category": "distributed",
    "desc": "Understandable consensus protocol using leader election and log replication to achieve fault-tolerant distributed agreement",
    "desc_zh": "通过领导者选举和日志复制实现容错分布式一致性的可理解共识协议",
    "steps": [
      "Partition time into terms: each term begins with a leader election triggered by follower timeout; terms act as logical clocks that prevent stale commands from being accepted",
      "Elect a leader via randomized timeouts: followers wait a random interval before requesting votes; the first candidate to reach a majority quorum becomes leader for the term",
      "Replicate log entries through AppendEntries RPCs: the leader appends a client command to its own log, then broadcasts AppendEntries to all followers and waits for majority acknowledgment",
      "Commit entries once a quorum responds: when a majority of nodes have written the entry to stable storage, the leader marks it committed, applies it to the state machine, and responds to the client",
      "Handle crashes and rejoin with log reconciliation: when a crashed node restarts or a new leader is elected, the leader backtracks its nextIndex for each follower until their logs align, then streams missing entries forward"
    ],
    "steps_zh": [
      "将时间划分为任期：每个任期以跟随者超时触发的领导者选举开始；任期作为逻辑时钟，防止过期命令被接受",
      "通过随机超时选举领导者：跟随者等待一个随机间隔后请求投票；第一个获得多数派的候选人成为该任期的领导者",
      "通过 AppendEntries RPC 复制日志条目：领导者将客户端命令追加到自身日志，然后广播 AppendEntries 给所有跟随者，等待多数派确认",
      "获得法定人数响应后提交条目：当多数节点将条目写入稳定存储后，领导者标记其为已提交，应用到状态机并响应客户端",
      "通过日志对账处理崩溃和重新加入：当崩溃节点重启或新领导者选出时，领导者为每个跟随者回溯 nextIndex 直至日志对齐，然后向前传输缺失条目"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Terms",
      "Leader Election",
      "AppendEntries",
      "Commit Quorum",
      "Log Reconcile"
    ],
    "viz_labels_zh": [
      "任期",
      "领导选举",
      "日志追加",
      "多数提交",
      "日志对齐"
    ],
    "related": [
      "consensus-protocols",
      "leader-election",
      "two-phase-commit",
      "eventual-consistency"
    ],
    "tags": [
      "consensus",
      "raft",
      "leader-election",
      "log-replication",
      "fault-tolerance"
    ],
    "origin_author": "Diego Ongaro",
    "origin_source": "Ongaro, D. & Ousterhout, J. (2014). In Search of an Understandable Consensus Algorithm. USENIX ATC 2014; Ongaro, D. (2014). Consensus: Bridging Theory and Practice. Stanford PhD dissertation.",
    "origin_source_zh": "Ongaro与Ousterhout（2014）《寻找一种可理解的共识算法》USENIX ATC 2014；Ongaro（2014）《共识：架桥理论与实践》斯坦福博士论文",
    "complexity": "advanced",
    "when_to_use": [
      "When building a replicated state machine that must tolerate minority node failures without data loss or split-brain",
      "When implementing a distributed coordination service such as a configuration store, distributed lock, or cluster membership registry",
      "When you need a consensus algorithm that your team can fully understand, audit, and implement correctly",
      "When deploying a clustered database or message broker that requires automatic leader failover without manual operator intervention",
      "When the write path can afford quorum latency and strong linearizability guarantees outweigh throughput maximization"
    ],
    "when_to_use_zh": [
      "构建必须容忍少数节点故障且不丢失数据或产生脑裂的复制状态机时",
      "实现配置存储、分布式锁或集群成员注册表等分布式协调服务时",
      "需要团队能够完全理解、审计和正确实现的共识算法时",
      "部署需要自动领导者故障转移、无需人工干预的集群数据库或消息代理时",
      "写路径可以承受法定人数延迟且强线性一致性保证优先于吞吐量最大化时"
    ],
    "core_concepts": [
      "Leader supremacy: only the current-term leader accepts client writes and replicates them; followers are read-only participants that redirect writes to the leader",
      "Term monotonicity: every RPC carries a term number; nodes reject messages from lower terms and step down if they see a higher term, preventing stale leaders from corrupting state",
      "Log matching property: if two logs contain an entry with the same index and term, then the logs are identical in all entries up to that index, enabling safe replication",
      "Election safety: at most one leader can be elected per term because winning requires votes from a quorum that overlaps with any other potential quorum",
      "Log compaction via snapshots: once logs grow too large, the state machine state is snapshotted, older log entries are discarded, and lagging followers receive snapshots instead of full log replay"
    ],
    "core_concepts_zh": [
      "领导者至上：只有当前任期的领导者接受客户端写入并复制它们；跟随者是只读参与者，将写入重定向到领导者",
      "任期单调性：每个 RPC 携带任期号；节点拒绝来自更低任期的消息，若看到更高任期则降级，防止过期领导者破坏状态",
      "日志匹配属性：若两个日志包含相同索引和任期的条目，则这两个日志在该索引之前的所有条目完全相同，保证安全复制",
      "选举安全性：每个任期最多选出一个领导者，因为获胜需要来自法定人数的投票，该法定人数与任何其他潜在法定人数有交集",
      "通过快照压缩日志：日志过大时，对状态机状态打快照，丢弃旧日志条目，落后的跟随者接收快照而非完整日志回放"
    ],
    "timeline": [
      [
        "2013",
        "Diego Ongaro presents the initial Raft design at USENIX HotOS XIII as part of his Stanford PhD research"
      ],
      [
        "2014",
        "Ongaro and Ousterhout publish 'In Search of an Understandable Consensus Algorithm' at USENIX ATC, introducing Raft to the wider community"
      ],
      [
        "2014",
        "Ongaro completes his PhD dissertation 'Consensus: Bridging Theory and Practice', providing the comprehensive Raft specification"
      ],
      [
        "2015",
        "etcd, CoreOS's Raft-based key-value store, becomes the state store for Kubernetes, making Raft the de facto consensus backbone for cloud-native infrastructure"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "Diego Ongaro 在 USENIX HotOS XIII 上首次展示 Raft 设计，作为其斯坦福博士研究的一部分"
      ],
      [
        "2014",
        "Ongaro 与 Ousterhout 在 USENIX ATC 发表「寻找一种可理解的共识算法」，向更广泛的社区介绍 Raft"
      ],
      [
        "2014",
        "Ongaro 完成博士论文「共识：架桥理论与实践」，提供了完整的 Raft 规范"
      ],
      [
        "2015",
        "CoreOS 基于 Raft 的键值存储 etcd 成为 Kubernetes 的状态存储，使 Raft 成为云原生基础设施的事实共识骨干"
      ]
    ],
    "dos": [
      "Do use an odd number of nodes (3, 5, 7) because adding one node to reach an even count raises the quorum size without tolerating any additional failure, and makes tied votes more likely",
      "Do persist the current term, voted-for, and log entries to stable storage before responding to any RPC to ensure correct recovery after crashes",
      "Do implement leader lease reads or linearizable read index to serve reads without additional log entries when read-heavy workloads dominate",
      "Do monitor election frequency and leader stability because frequent elections indicate network instability or misconfigured heartbeat intervals"
    ],
    "dos_zh": [
      "使用奇数个节点（3、5、7），因为增加一个节点凑成偶数只会提高法定人数而不会多容忍任何故障，还会使平票更容易出现",
      "在响应任何 RPC 之前将当前任期、已投票对象和日志条目持久化到稳定存储，确保崩溃后正确恢复",
      "在读密集型工作负载为主时实现领导者租约读取或线性一致读取索引，无需额外日志条目即可服务读请求",
      "监控选举频率和领导者稳定性，因为频繁选举表明网络不稳定或心跳间隔配置错误"
    ],
    "donts": [
      "Don't run Raft clusters across wide-area networks without accounting for inter-datacenter latency because quorum writes block on the slowest acknowledgment in the majority",
      "Don't skip log compaction implementation because unbounded log growth will exhaust disk and make follower recovery prohibitively slow",
      "Don't assume Raft handles sharding or partitioning — it manages a single replicated log and must be combined with a shard routing layer for horizontal scale",
      "Don't mix Raft cluster members across significantly different hardware because persistent storage speed differences create chronic follower lag"
    ],
    "donts_zh": [
      "不要在不考虑数据中心间延迟的情况下跨广域网运行 Raft 集群，因为法定人数写入会阻塞在多数派中最慢的确认上",
      "不要跳过日志压缩实现，因为无限日志增长会耗尽磁盘并使跟随者恢复变得极慢",
      "不要假设 Raft 处理分片或分区——它管理单个复制日志，水平扩展必须结合分片路由层",
      "不要将 Raft 集群成员混合在差异显著的硬件上，因为持久化存储速度差异会造成长期跟随者滞后"
    ],
    "case_study_company": "CockroachDB",
    "case_study": "CockroachDB uses Raft at the core of its range-based replication model. Each key range (default 64 MB) is managed by a separate Raft group of three replicas. When a node fails, the Raft group for affected ranges elects a new leader and continues accepting writes within seconds, with no operator intervention. CockroachDB's engineers specifically chose Raft over Paxos because the algorithm's clean separation of leader election, log replication, and safety invariants made it tractable to implement correctly in a production database — a decision validated by successfully passing Jepsen distributed systems testing.",
    "case_study_zh": "CockroachDB 在其基于范围的复制模型核心使用 Raft。每个键范围（默认 64 MB）由一个独立的三副本 Raft 组管理。当节点故障时，受影响范围的 Raft 组在几秒内选出新领导者并继续接受写入，无需运维干预。CockroachDB 的工程师特别选择 Raft 而非 Paxos，因为算法对领导者选举、日志复制和安全不变量的清晰分离使其在生产数据库中正确实现成为可能——这一决定因成功通过 Jepsen 分布式系统测试而得到验证。",
    "when_not_to_use": [
      "High-throughput write-heavy workloads where quorum latency per write is unacceptable and eventual consistency suffices",
      "Geographically distributed multi-region deployments where cross-region quorum writes introduce hundreds of milliseconds of latency",
      "Single-node embedded systems or applications with no replication requirement where consensus overhead is pure waste"
    ],
    "when_not_to_use_zh": [
      "每次写入的法定人数延迟不可接受且最终一致性已足够的高吞吐写密集型工作负载",
      "跨区域法定人数写入引入数百毫秒延迟的地理分布式多区域部署",
      "没有复制需求的单节点嵌入式系统或应用，共识开销纯属浪费"
    ],
    "adopters": [
      "CockroachDB",
      "etcd/Kubernetes",
      "TiKV/TiDB",
      "Consul (HashiCorp)",
      "Dgraph"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Ongaro, D. & Ousterhout, J. (2014). \"In Search of an Understandable Consensus Algorithm\". Proceedings of USENIX ATC 2014.",
    "secondary_sources": [
      "Ongaro, D. (2014). \"Consensus: Bridging Theory and Practice\". Stanford University PhD Dissertation.",
      "Howard, H., Schwarzkopf, M., Madhavapeddy, A. & Crowcroft, J. (2016). \"Flexible Paxos: Quorum Intersection Revisited\". arXiv:1608.06696.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 9. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "consensus-protocols",
        "type": "alternative"
      },
      {
        "slug": "leader-election",
        "type": "complement"
      },
      {
        "slug": "two-phase-commit",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 265,
    "name": "CRDTs (Conflict-free Replicated Data Types)",
    "name_zh": "无冲突复制数据类型（CRDT）",
    "slug": "crdts",
    "category": "distributed",
    "desc": "Eventual consistency without coordination using algebraic data structures that merge automatically",
    "desc_zh": "使用可自动合并的代数数据结构实现无需协调的最终一致性",
    "steps": [
      "Identify data types that require multi-writer concurrent updates: shopping carts, collaborative text, counters, presence sets, or configuration flags under concurrent modification",
      "Choose the appropriate CRDT variant: G-Counter for monotone increments, PN-Counter for increment/decrement, OR-Set for add-wins semantics, or sequence CRDTs (RGA, LSEQ) for ordered text",
      "Implement the merge function: define a join operation that is commutative, associative, and idempotent so any two replicas converge to the same state regardless of message order or duplicates",
      "Propagate state or operations: use state-based CRDTs that gossip full state snapshots or operation-based CRDTs that broadcast individual ops over a reliable causal delivery channel",
      "Monitor divergence and convergence: track vector clocks or version vectors to detect out-of-order delivery, measure replica lag, and validate that all nodes eventually reach the same state"
    ],
    "steps_zh": [
      "识别需要多写入方并发更新的数据类型：购物车、协作文本、计数器、在线状态集合，或并发修改下的配置标志",
      "选择适当的 CRDT 变体：G-Counter 用于单调递增，PN-Counter 用于增减操作，OR-Set 用于添加优先语义，序列 CRDT（RGA、LSEQ）用于有序文本",
      "实现合并函数：定义满足交换律、结合律和幂等性的 join 操作，使任意两个副本无论消息顺序或重复如何都能收敛到相同状态",
      "传播状态或操作：使用基于状态的 CRDT（gossip 完整状态快照）或基于操作的 CRDT（通过可靠因果传递通道广播单个操作）",
      "监控分歧与收敛：追踪向量时钟或版本向量以检测乱序投递，衡量副本滞后，验证所有节点最终达到相同状态"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Concurrent Writes",
      "CRDT Variant",
      "Merge Function",
      "Propagation",
      "Divergence Monitor"
    ],
    "viz_labels_zh": [
      "并发写入",
      "CRDT变体",
      "合并函数",
      "状态传播",
      "收敛监控"
    ],
    "related": [
      "eventual-consistency",
      "gossip-protocol",
      "cap-theorem",
      "consensus-protocols"
    ],
    "tags": [
      "crdt",
      "eventual-consistency",
      "conflict-resolution",
      "collaborative-editing",
      "distributed-data"
    ],
    "origin_author": "Marc Shapiro",
    "origin_source": "Shapiro, M., Preguica, N., Baquero, C. & Zawirski, M. (2011). A Comprehensive Study of Convergent and Commutative Replicated Data Types. INRIA Technical Report RR-7506.",
    "origin_source_zh": "Shapiro等（2011）《收敛与交换复制数据类型综合研究》INRIA技术报告RR-7506",
    "complexity": "advanced",
    "when_to_use": [
      "When multiple nodes or users must write to the same data concurrently without requiring coordination or locks",
      "When network partitions must not block writes — all nodes should accept mutations and merge later",
      "When building collaborative real-time applications such as shared documents, multiplayer games, or distributed configuration stores",
      "When the data type has a natural monotone lattice structure (counters, sets, maps, sequences) that maps cleanly to a CRDT variant",
      "When you want to eliminate distributed transactions for specific data types and accept the constraints of the merge semantics"
    ],
    "when_to_use_zh": [
      "当多个节点或用户必须并发写入同一数据而不需要协调或锁时",
      "当网络分区不能阻塞写入——所有节点应接受变更并在之后合并时",
      "构建协作实时应用（如共享文档、多人游戏或分布式配置存储）时",
      "当数据类型具有自然单调格结构（计数器、集合、映射、序列），能干净地映射到 CRDT 变体时",
      "当你希望为特定数据类型消除分布式事务并接受合并语义约束时"
    ],
    "core_concepts": [
      "Lattice: A partially ordered set with a join (least upper bound) operation; CRDT state forms a lattice where the merge function computes the join of two states",
      "Commutativity, associativity, idempotency (CAI): The three algebraic properties a merge function must satisfy to guarantee convergence regardless of message delivery order or duplication",
      "State-based vs. operation-based: State-based (CvRDT) replicas exchange full snapshots and apply merge; operation-based (CmRDT) replicas broadcast operations that require exactly-once or causal delivery",
      "Causal consistency: Operation-based CRDTs often rely on causal broadcast so that a remove operation is never applied before the add it references, preventing phantom deletions",
      "Delta CRDTs: An optimization that propagates only the state delta since the last sync rather than the full state, dramatically reducing bandwidth for large state-based CRDTs"
    ],
    "core_concepts_zh": [
      "格：具有 join（最小上界）操作的偏序集；CRDT 状态形成格，合并函数计算两个状态的 join",
      "交换律、结合律、幂等性（CAI）：合并函数必须满足的三个代数属性，保证无论消息投递顺序或重复如何都能收敛",
      "基于状态与基于操作：基于状态（CvRDT）副本交换完整快照并应用合并；基于操作（CmRDT）副本广播操作，需要恰好一次或因果投递",
      "因果一致性：基于操作的 CRDT 通常依赖因果广播，确保删除操作永远不会在其引用的添加操作之前应用，防止幽灵删除",
      "Delta CRDT：一种优化，只传播自上次同步以来的状态增量而非完整状态，大幅减少大型基于状态 CRDT 的带宽消耗"
    ],
    "timeline": [
      [
        "2007",
        "Shapiro, Preguica, and colleagues begin formalizing the mathematical foundations of CRDTs at INRIA"
      ],
      [
        "2011",
        "Shapiro et al. publish the comprehensive CRDT survey (RR-7506), establishing the theoretical taxonomy of CvRDTs and CmRDTs"
      ],
      [
        "2014",
        "Basho adopts CRDTs in Riak 2.0 for counters, sets, and maps — the first major production database to ship built-in CRDT types"
      ],
      [
        "2017",
        "Apple's iCloud Notes, Figma's multiplayer canvas, and Notion all adopt CRDT-based synchronization for real-time collaborative editing"
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "Shapiro、Preguica 及同事开始在 INRIA 形式化 CRDT 的数学基础"
      ],
      [
        "2011",
        "Shapiro 等发表全面的 CRDT 综述（RR-7506），建立 CvRDT 和 CmRDT 的理论分类体系"
      ],
      [
        "2012",
        "Basho 在 Riak 2.0 中为计数器、集合和映射采用 CRDT——第一个内置 CRDT 类型的主要生产数据库"
      ],
      [
        "2017",
        "Apple iCloud Notes、Figma 多人画布和 Notion 均采用基于 CRDT 的同步机制实现实时协作编辑"
      ]
    ],
    "dos": [
      "Do choose the semantics carefully before selecting a CRDT: add-wins vs. remove-wins sets have different user-visible behavior that must match your product requirements",
      "Do use delta CRDTs for state-based designs in bandwidth-constrained environments to avoid sending full state snapshots on every sync",
      "Do combine CRDTs with version vectors or hybrid logical clocks so you can track causality and detect stale state during reconciliation",
      "Do test your merge function algebraically with property-based testing to verify commutativity, associativity, and idempotency for all inputs"
    ],
    "dos_zh": [
      "在选择 CRDT 之前仔细选择语义：添加优先与删除优先集合有不同的用户可见行为，必须与产品需求匹配",
      "在带宽受限环境中对基于状态的设计使用 Delta CRDT，避免每次同步都发送完整状态快照",
      "将 CRDT 与版本向量或混合逻辑时钟结合，以便在协调期间追踪因果关系并检测过期状态",
      "使用基于属性的测试对合并函数进行代数测试，验证所有输入的交换律、结合律和幂等性"
    ],
    "donts": [
      "Don't use CRDTs for financial transactions or any domain requiring strict linearizability — CRDT semantics can silently merge conflicting writes in ways that violate business invariants",
      "Don't ignore the garbage collection problem: OR-Sets and sequence CRDTs accumulate tombstones that must be periodically pruned to prevent unbounded memory growth",
      "Don't assume operation-based CRDTs work correctly over unreliable transports — they require causal or exactly-once delivery; using them over UDP without ordering guarantees causes corruption",
      "Don't hand-roll complex CRDTs in production without formal verification or an established library — subtle violations of the CAI properties cause hard-to-reproduce divergence bugs"
    ],
    "donts_zh": [
      "不要将 CRDT 用于金融事务或任何需要严格线性一致性的领域——CRDT 语义可能以违反业务不变量的方式静默合并冲突写入",
      "不要忽视垃圾回收问题：OR-Set 和序列 CRDT 会积累墓碑标记，必须定期清理以防止内存无限增长",
      "不要假设基于操作的 CRDT 能在不可靠传输上正确工作——它们需要因果或恰好一次投递；在无顺序保证的 UDP 上使用会导致数据损坏",
      "不要在生产中手动实现复杂的 CRDT 而不进行形式化验证或使用成熟库——CAI 属性的细微违反会导致难以复现的分歧缺陷"
    ],
    "case_study_company": "Figma",
    "case_study": "Figma's multiplayer editing engine uses CRDT-inspired data structures to allow dozens of designers to simultaneously edit the same file without locks or coordination. Each design object (frame, layer, property) is modeled as an independent CRDT — changes to position, color, and hierarchy are represented as operation logs that merge deterministically. When a user goes offline and edits a shared component, their changes are buffered locally and merged with the server state on reconnect without conflicts. This architecture allows Figma to support real-time collaboration across 50+ simultaneous cursors in a single file, a capability that would require complex distributed locking under traditional approaches.",
    "case_study_zh": "Figma 的多人编辑引擎使用受 CRDT 启发的数据结构，允许数十名设计师无需锁或协调地同时编辑同一文件。每个设计对象（框架、图层、属性）被建模为独立的 CRDT——位置、颜色和层级的变更表示为可确定性合并的操作日志。用户离线编辑共享组件时，变更在本地缓冲，重新连接后与服务器状态无冲突合并。这一架构使 Figma 能在单个文件中支持 50+ 个同时在线光标的实时协作，而传统方式需要复杂的分布式锁。",
    "when_not_to_use": [
      "Financial systems, inventory management, or any domain where concurrent writes must respect strong business invariants that cannot be expressed as a lattice join",
      "Simple single-writer or leader-replica architectures where coordination is cheap and CRDT complexity adds no benefit",
      "Data with complex relational integrity constraints (foreign keys, uniqueness) that cannot be expressed as monotone lattice operations"
    ],
    "when_not_to_use_zh": [
      "金融系统、库存管理或任何并发写入必须遵守无法表示为格 join 的强业务不变量的领域",
      "简单的单写入方或领导者-副本架构，协调代价低廉且 CRDT 复杂性没有收益",
      "具有复杂关系完整性约束（外键、唯一性）且无法表示为单调格操作的数据"
    ],
    "adopters": [
      "Riak (Basho)",
      "Figma",
      "Apple iCloud",
      "Redis Enterprise",
      "SoundCloud (Roshi)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Shapiro, M., Preguica, N., Baquero, C. & Zawirski, M. (2011). \"A Comprehensive Study of Convergent and Commutative Replicated Data Types\". INRIA Technical Report RR-7506.",
    "secondary_sources": [
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 5. O'Reilly Media.",
      "Almeida, P.S., Shoker, A. & Baquero, C. (2018). \"Delta State Replicated Data Types\". Journal of Parallel and Distributed Computing, 111.",
      "Nair, S. & Zawirski, M. (2015). \"An Introduction to Conflict-Free Replicated Data Types\". INRIA Research Report."
    ],
    "typed_relations": [
      {
        "slug": "eventual-consistency",
        "type": "complement"
      },
      {
        "slug": "gossip-protocol",
        "type": "complement"
      },
      {
        "slug": "consensus-protocols",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 266,
    "name": "Gossip Protocol",
    "name_zh": "Gossip 协议",
    "slug": "gossip-epidemic-protocol",
    "category": "distributed",
    "desc": "Epidemic-style information dissemination achieving reliable cluster-wide propagation without central coordination",
    "desc_zh": "无需中央协调、以流行病方式实现可靠集群范围信息传播的协议",
    "steps": [
      "Define the message payload and propagation goal: determine what information each node must spread (membership updates, failure detections, configuration changes) and the convergence time target",
      "Select a gossip variant: push gossip (sender initiates and pushes state to random peers), pull gossip (receiver queries random peers for their state), or push-pull (exchange in both directions per round)",
      "Configure fanout and interval: choose the number of peers contacted per gossip round (fanout) and the interval between rounds; fanout of log(N) provides reliable propagation in O(log N) rounds",
      "Implement failure detection integration: combine gossip with a SWIM-style failure detector that piggybacks liveness probes on gossip messages to spread node up/down status with low overhead",
      "Add entropy reduction: use Merkle trees or digest-based comparison so nodes only exchange differing state, preventing redundant full-state transfers as cluster size grows"
    ],
    "steps_zh": [
      "定义消息载荷和传播目标：确定每个节点必须传播的信息（成员更新、故障检测、配置变更）和收敛时间目标",
      "选择 gossip 变体：推送 gossip（发送方主动将状态推送给随机对等节点），拉取 gossip（接收方查询随机对等节点的状态），或推拉（每轮双向交换）",
      "配置扇出和间隔：选择每轮 gossip 联系的对等节点数（扇出）和轮次间隔；log(N) 的扇出可在 O(log N) 轮内可靠传播",
      "集成故障检测：将 gossip 与 SWIM 风格的故障检测器结合，将活性探测搭载在 gossip 消息上传播节点上线/下线状态，开销极低",
      "添加熵减少机制：使用 Merkle 树或基于摘要的比较，使节点只交换不同的状态，防止集群规模增长时产生冗余的全状态传输"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Message Payload",
      "Gossip Variant",
      "Fanout Interval",
      "Failure Detect",
      "Entropy Reduce"
    ],
    "viz_labels_zh": [
      "消息内容",
      "Gossip变体",
      "扇出间隔",
      "故障检测",
      "熵减少"
    ],
    "related": [
      "eventual-consistency",
      "crdts",
      "service-discovery",
      "consensus-protocols"
    ],
    "tags": [
      "gossip",
      "epidemic-protocol",
      "decentralized",
      "membership",
      "failure-detection"
    ],
    "origin_author": "Alan Demers",
    "origin_source": "Demers, A. et al. (1987). Epidemic Algorithms for Replicated Database Maintenance. Proceedings of PODC 1987; Das, A., Gupta, I. & Motivala, A. (2002). SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol. DSN 2002.",
    "origin_source_zh": "Demers等（1987）《复制数据库维护的流行病算法》PODC 1987；Das等（2002）《SWIM：可扩展弱一致感染式进程组成员协议》DSN 2002",
    "complexity": "intermediate",
    "when_to_use": [
      "When you need to propagate state or membership information across a large cluster without a central coordinator or single point of failure",
      "When eventual consistency of metadata is acceptable and O(log N) convergence time is sufficient",
      "When building a peer-to-peer overlay, distributed hash table, or service mesh that requires decentralized health and membership tracking",
      "When the network topology is dynamic with frequent node joins and leaves and you need a self-healing propagation mechanism"
    ],
    "when_to_use_zh": [
      "需要在大型集群中传播状态或成员信息而无需中央协调器或单点故障时",
      "元数据的最终一致性可接受且 O(log N) 收敛时间足够时",
      "构建需要去中心化健康和成员追踪的对等覆盖网络、分布式哈希表或服务网格时",
      "网络拓扑动态变化、节点频繁加入和离开且需要自愈传播机制时"
    ],
    "core_concepts": [
      "Epidemiological model: gossip borrows from SIR disease-spread models where nodes are Susceptible (haven't seen the message), Infected (spreading it), or Removed (stopped spreading after k rounds)",
      "Convergence bound: with fanout f and N nodes, gossip reaches all nodes in O(log N / log f) rounds with high probability, making it highly scalable without central infrastructure",
      "SWIM failure detector: Scalable Weakly-consistent Infection-style Membership uses gossip to disseminate node up/down events detected via direct and indirect pings, replacing expensive heartbeat floods",
      "Anti-entropy gossip: periodic full-state comparison between random pairs to repair divergence not covered by incremental gossip, typically using Merkle trees for efficient delta detection",
      "Rumor mongering vs. anti-entropy: rumor mongering spreads new updates quickly with low message count; anti-entropy repairs lingering divergence but requires more bandwidth per exchange"
    ],
    "core_concepts_zh": [
      "流行病学模型：gossip 借鉴 SIR 疾病传播模型，节点处于易感（未见过消息）、感染（正在传播）或移除（传播 k 轮后停止）状态",
      "收敛界：扇出 f 和 N 个节点时，gossip 以高概率在 O(log N / log f) 轮内到达所有节点，无需中央基础设施即可高度扩展",
      "SWIM 故障检测器：可扩展弱一致感染式成员协议使用 gossip 传播通过直接和间接 ping 检测到的节点上线/下线事件，取代昂贵的心跳泛洪",
      "反熵 gossip：随机对之间的定期全状态比较，修复增量 gossip 未覆盖的分歧，通常使用 Merkle 树进行高效增量检测",
      "谣言传播与反熵：谣言传播以低消息数快速传播新更新；反熵修复持续存在的分歧，但每次交换需要更多带宽"
    ],
    "timeline": [
      [
        "1987",
        "Alan Demers and colleagues at Xerox PARC publish 'Epidemic Algorithms for Replicated Database Maintenance', introducing the gossip dissemination model"
      ],
      [
        "2002",
        "Das, Gupta, and Motivala introduce SWIM at DSN 2002, combining gossip with efficient failure detection for large-scale membership protocols"
      ],
      [
        "2007",
        "Amazon Dynamo paper describes gossip-based membership and failure detection as core infrastructure for a planetary-scale key-value store"
      ],
      [
        "2014",
        "HashiCorp releases Serf, a standalone gossip-based membership and failure detection tool, making the pattern accessible as a library"
      ]
    ],
    "timeline_zh": [
      [
        "1987",
        "Alan Demers 及 Xerox PARC 同事发表「复制数据库维护的流行病算法」，引入 gossip 传播模型"
      ],
      [
        "2002",
        "Das、Gupta 和 Motivala 在 DSN 2002 发表 SWIM，将 gossip 与高效故障检测结合用于大规模成员协议"
      ],
      [
        "2007",
        "Amazon Dynamo 论文将基于 gossip 的成员和故障检测描述为行星规模键值存储的核心基础设施"
      ],
      [
        "2014",
        "HashiCorp 发布 Serf，一个独立的基于 gossip 的成员和故障检测工具，使该模式可作为库使用"
      ]
    ],
    "dos": [
      "Do tune fanout based on cluster size: use log(N) peers per round to balance convergence speed against message overhead, and increase fanout only when you need faster convergence",
      "Do combine gossip with a consistent hash ring or ring topology so nodes have a stable peer list to gossip with even during churn",
      "Do piggyback metadata on gossip messages (SWIM-style) to avoid separate heartbeat floods — this dramatically reduces per-node bandwidth consumption at scale",
      "Do implement message deduplication and seen-message filtering using bloom filters or a bounded history buffer to prevent infinite message amplification"
    ],
    "dos_zh": [
      "根据集群大小调整扇出：每轮使用 log(N) 个对等节点以平衡收敛速度和消息开销，只在需要更快收敛时才增加扇出",
      "将 gossip 与一致性哈希环或环形拓扑结合，使节点在节点流动时仍有稳定的对等列表可供 gossip",
      "在 gossip 消息上捎带元数据（SWIM 风格）以避免单独的心跳泛洪——这在规模上大幅减少每节点带宽消耗",
      "使用布隆过滤器或有界历史缓冲区实现消息去重和已见消息过滤，防止无限消息放大"
    ],
    "donts": [
      "Don't use gossip for data that requires strong consistency — gossip provides probabilistic dissemination with no delivery guarantee or ordering, only eventual convergence",
      "Don't gossip large payloads: gossip is designed for small state deltas and membership vectors; large messages negate the bandwidth advantage over broadcast",
      "Don't rely solely on gossip for failure detection in latency-sensitive paths — gossip convergence time means failure information may take multiple rounds to reach all nodes",
      "Don't neglect network partitions in gossip-based systems — a partition creates two gossip islands that diverge silently; reconciliation must be handled explicitly on partition heal"
    ],
    "donts_zh": [
      "不要将 gossip 用于需要强一致性的数据——gossip 提供概率性传播，没有投递保证或顺序保证，只有最终收敛",
      "不要 gossip 大负载：gossip 为小状态增量和成员向量设计；大消息抵消了相对广播的带宽优势",
      "不要仅依赖 gossip 进行延迟敏感路径的故障检测——gossip 收敛时间意味着故障信息可能需要多轮才能到达所有节点",
      "不要忽视基于 gossip 的系统中的网络分区——分区会创建两个静默分歧的 gossip 孤岛；分区修复时必须显式处理协调"
    ],
    "case_study_company": "Apache Cassandra",
    "case_study": "Apache Cassandra uses gossip as its primary mechanism for cluster membership, node health monitoring, and schema propagation. Every second, each node contacts up to three random peers to exchange endpoint state — including generation number, heartbeat version, load, schema version, and rack/datacenter topology. This gossip state is then used by the token ring to route requests correctly and by the hinted handoff system to detect which nodes are unavailable. When a new node bootstraps into a 1,000-node ring, it gossips its token ranges to a seed node, and that information propagates cluster-wide in approximately log₂(1000) ≈ 10 rounds — roughly 10 seconds — without any central registry.",
    "case_study_zh": "Apache Cassandra 使用 gossip 作为集群成员、节点健康监控和模式传播的主要机制。每秒，每个节点联系最多三个随机对等节点交换端点状态——包括代号、心跳版本、负载、模式版本和机架/数据中心拓扑。这些 gossip 状态被令牌环用于正确路由请求，也被提示移交系统用于检测不可用节点。当新节点引导加入 1000 节点的环时，它将令牌范围 gossip 给种子节点，该信息在大约 log₂(1000) ≈ 10 轮（约 10 秒）内在集群范围内传播——无需任何中央注册表。",
    "when_not_to_use": [
      "When strong consistency or guaranteed delivery ordering is required — use consensus protocols or message queues with ordering guarantees instead",
      "Very small clusters (fewer than 5 nodes) where the overhead of gossip exceeds the benefit and simple broadcast or direct notification is more efficient",
      "Latency-critical control planes where failure detection must happen in milliseconds rather than gossip convergence time"
    ],
    "when_not_to_use_zh": [
      "当需要强一致性或保证投递顺序时——改用共识协议或具有顺序保证的消息队列",
      "非常小的集群（少于 5 个节点），gossip 的开销超过收益，简单广播或直接通知更高效",
      "故障检测必须在毫秒内完成而非 gossip 收敛时间的延迟敏感控制平面"
    ],
    "adopters": [
      "Apache Cassandra",
      "Amazon Dynamo",
      "HashiCorp Consul",
      "Riak",
      "Bitcoin / blockchain P2P networks"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Demers, A. et al. (1987). \"Epidemic Algorithms for Replicated Database Maintenance\". Proceedings of the 6th Annual ACM Symposium on Principles of Distributed Computing (PODC).",
    "secondary_sources": [
      "Das, A., Gupta, I. & Motivala, A. (2002). \"SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol\". Proceedings of DSN 2002.",
      "van Renesse, R. et al. (1998). \"Astrolabe: A Robust and Scalable Technology for Distributed System Monitoring\". ACM TOCS.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 5. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "eventual-consistency",
        "type": "complement"
      },
      {
        "slug": "crdts",
        "type": "complement"
      },
      {
        "slug": "service-discovery",
        "type": "complement"
      }
    ]
  },
  {
    "id": 267,
    "name": "Service Discovery Pattern",
    "name_zh": "服务发现模式",
    "slug": "service-discovery-pattern",
    "category": "distributed",
    "desc": "DNS-based and registry-based mechanisms for services to locate each other dynamically in elastic infrastructure",
    "desc_zh": "服务在弹性基础设施中动态定位彼此的基于 DNS 和注册表的机制",
    "steps": [
      "Choose a discovery strategy: client-side discovery (client queries registry and load-balances itself), server-side discovery (router or load balancer queries registry on behalf of client), or DNS-based (services resolved via DNS SRV records)",
      "Implement service registration: on startup, each service instance registers its address, port, health check endpoint, and metadata (version, region, tags) with the service registry",
      "Configure health checking: define liveness and readiness probes that the registry polls at a configured interval; failing checks cause the registry to deregister the instance and stop routing traffic to it",
      "Consume the registry in clients: use a discovery client library (Consul client, Eureka client, Kubernetes service DNS) to resolve logical service names to physical addresses, applying load balancing on the returned set",
      "Handle deregistration and graceful shutdown: ensure instances deregister before terminating; implement a grace period so in-flight requests complete before the instance is removed from the registry"
    ],
    "steps_zh": [
      "选择发现策略：客户端侧发现（客户端查询注册表并自行负载均衡）、服务器侧发现（路由器或负载均衡器代表客户端查询注册表），或基于 DNS（通过 DNS SRV 记录解析服务）",
      "实现服务注册：启动时，每个服务实例将其地址、端口、健康检查端点和元数据（版本、区域、标签）注册到服务注册表",
      "配置健康检查：定义注册表按配置间隔轮询的活性和就绪探针；检查失败会导致注册表注销实例并停止向其路由流量",
      "在客户端消费注册表：使用发现客户端库（Consul 客户端、Eureka 客户端、Kubernetes 服务 DNS）将逻辑服务名解析为物理地址，对返回集应用负载均衡",
      "处理注销和优雅关闭：确保实例在终止前注销；实现宽限期使进行中的请求在实例从注册表移除前完成"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Discovery Strategy",
      "Registration",
      "Health Check",
      "Client Consume",
      "Graceful Shutdown"
    ],
    "viz_labels_zh": [
      "发现策略",
      "服务注册",
      "健康检查",
      "客户端消费",
      "优雅下线"
    ],
    "related": [
      "service-discovery",
      "gossip-protocol",
      "circuit-breaker-with-retry",
      "sidecar-pattern"
    ],
    "tags": [
      "service-discovery",
      "microservices",
      "load-balancing",
      "registry",
      "cloud-native"
    ],
    "origin_author": "Chris Richardson",
    "origin_source": "Richardson, C. (2018). Microservices Patterns, Ch. 3. Manning Publications; Fowler, M. & Lewis, J. (2014). Microservices. martinfowler.com.",
    "origin_source_zh": "Richardson（2018）《微服务架构设计模式》第3章，Manning；Fowler与Lewis（2014）《微服务》martinfowler.com",
    "complexity": "intermediate",
    "when_to_use": [
      "When running microservices on dynamic cloud infrastructure where instance IPs change on every restart, scaling event, or deployment",
      "When you have multiple service instances behind a load balancer and need clients to discover healthy instances without hard-coded configuration",
      "When deploying to container orchestration platforms (Kubernetes, Nomad, ECS) that natively provide service discovery primitives",
      "When different versions of a service must be routed to different clients during canary or blue-green deployments"
    ],
    "when_to_use_zh": [
      "在动态云基础设施上运行微服务，实例 IP 在每次重启、扩缩容或部署时都会改变时",
      "在负载均衡器后有多个服务实例，需要客户端在没有硬编码配置的情况下发现健康实例时",
      "部署到原生提供服务发现原语的容器编排平台（Kubernetes、Nomad、ECS）时",
      "在金丝雀或蓝绿部署期间，不同版本的服务需要路由到不同客户端时"
    ],
    "core_concepts": [
      "Service registry: a centralized or distributed database (Consul, Eureka, etcd, ZooKeeper) that stores the network locations of all available service instances with their metadata and health status",
      "Client-side discovery: the consuming service queries the registry directly and applies its own load balancing algorithm; provides flexibility but couples every client to the registry API",
      "Server-side discovery: a load balancer or API gateway queries the registry and routes traffic; decouples clients from the registry but adds a central routing hop",
      "Self-registration vs. third-party registration: in self-registration, instances register on startup; in third-party registration, an orchestrator (Kubernetes, Registrator) handles registration and deregistration",
      "DNS-SD: DNS Service Discovery uses SRV records to encode service name, protocol, port, and priority, enabling DNS resolvers to serve as the service registry without custom client libraries"
    ],
    "core_concepts_zh": [
      "服务注册表：存储所有可用服务实例网络位置及其元数据和健康状态的集中式或分布式数据库（Consul、Eureka、etcd、ZooKeeper）",
      "客户端侧发现：消费服务直接查询注册表并应用自己的负载均衡算法；灵活但将每个客户端与注册表 API 耦合",
      "服务器侧发现：负载均衡器或 API 网关查询注册表并路由流量；将客户端与注册表解耦，但增加了中央路由跳转",
      "自注册与第三方注册：自注册时，实例在启动时自行注册；第三方注册时，编排器（Kubernetes、Registrator）处理注册和注销",
      "DNS-SD：DNS 服务发现使用 SRV 记录编码服务名、协议、端口和优先级，使 DNS 解析器无需自定义客户端库即可充当服务注册表"
    ],
    "timeline": [
      [
        "2012",
        "Netflix open-sources Eureka, a REST-based service registry for their AWS-based microservices, establishing the self-registration pattern"
      ],
      [
        "2013",
        "HashiCorp releases Consul, combining service discovery with health checking, key-value storage, and multi-datacenter support"
      ],
      [
        "2014",
        "Kubernetes introduces kube-dns and Services as a first-class service discovery abstraction, making DNS-based discovery the cloud-native default"
      ],
      [
        "2017",
        "Istio service mesh shifts service discovery into the data plane via Envoy sidecar proxies, abstracting registry interaction away from application code"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Netflix 开源 Eureka，一个用于其 AWS 微服务的基于 REST 的服务注册表，建立了自注册模式"
      ],
      [
        "2013",
        "HashiCorp 发布 Consul，将服务发现与健康检查、键值存储和多数据中心支持结合"
      ],
      [
        "2014",
        "Kubernetes 引入 kube-dns 和 Services 作为一级服务发现抽象，使基于 DNS 的发现成为云原生默认方案"
      ],
      [
        "2017",
        "Istio 服务网格通过 Envoy sidecar 代理将服务发现转移到数据平面，将注册表交互从应用代码中抽象出去"
      ]
    ],
    "dos": [
      "Do implement health checks at the application level, not just the process level, so the registry deregisters instances that are running but not serving traffic correctly",
      "Do cache discovery results locally with a short TTL so that a registry outage does not immediately cascade into a full service outage",
      "Do tag instances with version, environment, and capability metadata so consumers can implement fine-grained routing without separate configuration",
      "Do test deregistration and failover paths in staging by deliberately crashing instances and verifying traffic shifts to healthy instances within your SLA"
    ],
    "dos_zh": [
      "在应用层而非仅进程层实现健康检查，使注册表能注销正在运行但未正确提供流量的实例",
      "使用短 TTL 在本地缓存发现结果，使注册表中断不会立即级联为完整服务中断",
      "用版本、环境和能力元数据标记实例，使消费者无需单独配置即可实现细粒度路由",
      "在预发布环境中通过故意崩溃实例并验证流量在 SLA 内切换到健康实例来测试注销和故障转移路径"
    ],
    "donts": [
      "Don't hard-code service IP addresses or ports in configuration files — this defeats the purpose of service discovery and creates drift as instances change",
      "Don't set health check intervals too long (more than 10 seconds) because stale registry entries cause traffic to be routed to failed instances for extended periods",
      "Don't run a single-node service registry without HA configuration — the registry itself must be highly available or it becomes a single point of failure for the entire service mesh",
      "Don't skip connection draining before deregistration — abrupt removal causes in-flight requests to fail with connection reset errors visible to end users"
    ],
    "donts_zh": [
      "不要在配置文件中硬编码服务 IP 地址或端口——这违背了服务发现的目的，并在实例变化时造成配置漂移",
      "不要将健康检查间隔设置过长（超过 10 秒），因为过期的注册表条目会导致流量在较长时间内路由到故障实例",
      "不要在没有高可用配置的情况下运行单节点服务注册表——注册表本身必须高可用，否则它会成为整个服务网格的单点故障",
      "不要在注销前跳过连接排空——突然移除会导致进行中的请求以对最终用户可见的连接重置错误失败"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix operates Eureka, their home-built service registry, across their global microservices fleet of thousands of instances. Every service registers with its regional Eureka cluster on startup and sends heartbeats every 30 seconds. Eureka's self-preservation mode is a particularly notable design: when it detects that more than 15% of expected heartbeats are missing (indicating a network partition rather than mass failure), it stops evicting instances from the registry rather than cascading into a blank slate. This design decision kept Netflix services discoverable and partially functional during several major AWS networking events where a registry that aggressively evicted instances would have caused complete service unavailability.",
    "case_study_zh": "Netflix 在其数千个实例的全球微服务集群上运行自建的服务注册表 Eureka。每个服务在启动时向其区域 Eureka 集群注册，并每 30 秒发送一次心跳。Eureka 的自我保护模式是一个特别值得注意的设计：当它检测到超过 15% 的预期心跳缺失时（表明是网络分区而非大规模故障），它会停止从注册表中驱逐实例，而不是级联清空注册表。这一设计决策使 Netflix 服务在几次重大 AWS 网络事件中保持可发现性和部分可用性——若注册表激进驱逐实例，将导致完全不可用。",
    "when_not_to_use": [
      "Small-scale deployments with a fixed set of services on static infrastructure where DNS entries or configuration files are sufficient and the registry adds operational complexity without benefit",
      "Monolithic applications where all components run in the same process and inter-component communication is in-process, not network-based",
      "Serverless or FaaS architectures where the platform handles routing transparently and service discovery is abstracted away by the function runtime"
    ],
    "when_not_to_use_zh": [
      "静态基础设施上具有固定服务集的小规模部署，DNS 条目或配置文件已足够，注册表只增加运维复杂性而无收益",
      "所有组件在同一进程中运行、组件间通信是进程内而非基于网络的单体应用",
      "平台透明处理路由、服务发现由函数运行时抽象的无服务器或 FaaS 架构"
    ],
    "adopters": [
      "Netflix (Eureka)",
      "HashiCorp Consul users",
      "Kubernetes (kube-dns + Services)",
      "Spring Cloud Gateway",
      "Istio service mesh"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Richardson, C. (2018). \"Microservices Patterns\", Ch. 3, Section 3.2.4: Using service discovery. Manning Publications.",
    "secondary_sources": [
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 10. O'Reilly Media.",
      "Burns, B. et al. (2016). \"Borg, Omega, and Kubernetes\". ACM Queue.",
      "Fowler, M. & Lewis, J. (2014). \"Microservices\". martinfowler.com."
    ],
    "typed_relations": [
      {
        "slug": "service-discovery",
        "type": "alternative"
      },
      {
        "slug": "circuit-breaker-with-retry",
        "type": "complement"
      },
      {
        "slug": "sidecar-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 268,
    "name": "Sidecar Pattern",
    "name_zh": "Sidecar 容器模式",
    "slug": "sidecar-container-pattern",
    "category": "distributed",
    "desc": "Deploying helper containers alongside the primary service container to handle cross-cutting concerns without modifying application code",
    "desc_zh": "在主服务容器旁部署辅助容器，无需修改应用代码即可处理横切关注点",
    "steps": [
      "Identify cross-cutting concerns to offload: select capabilities that apply to many services (TLS termination, service mesh proxying, log shipping, metrics collection, config sync) and are better handled outside application code",
      "Package the sidecar as a separate container image: the sidecar should be independently versioned, replaceable, and maintained by a different team (e.g., platform team) than the business logic container",
      "Co-locate in a shared Pod or task: deploy the sidecar and primary container as a unit (Kubernetes Pod, ECS task group) so they share network namespace, localhost access, and optionally volumes",
      "Configure communication via localhost: because sidecar and primary share network namespace, the primary calls the sidecar on localhost (e.g., Envoy proxy on 127.0.0.1:15001) with zero network hop overhead",
      "Implement lifecycle coordination: ensure the sidecar starts before or concurrently with the primary container, handles graceful shutdown in the correct order, and does not block primary container liveness checks"
    ],
    "steps_zh": [
      "识别要卸载的横切关注点：选择适用于许多服务的能力（TLS 终止、服务网格代理、日志传输、指标收集、配置同步），这些能力在应用代码外处理更好",
      "将 sidecar 打包为单独的容器镜像：sidecar 应独立版本控制、可替换，并由与业务逻辑容器不同的团队（如平台团队）维护",
      "在共享 Pod 或任务中共同部署：将 sidecar 和主容器作为一个单元部署（Kubernetes Pod、ECS 任务组），共享网络命名空间、localhost 访问和可选的卷",
      "通过 localhost 配置通信：因为 sidecar 和主容器共享网络命名空间，主容器通过 localhost 调用 sidecar（如 127.0.0.1:15001 上的 Envoy 代理），零网络跳转开销",
      "实现生命周期协调：确保 sidecar 在主容器之前或同时启动，以正确顺序处理优雅关闭，且不阻塞主容器的活性检查"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Cross-Cutting",
      "Sidecar Image",
      "Co-locate Pod",
      "Localhost Comm",
      "Lifecycle Coord"
    ],
    "viz_labels_zh": [
      "横切关注点",
      "独立镜像",
      "共同部署",
      "本地通信",
      "生命周期协调"
    ],
    "related": [
      "sidecar-pattern",
      "service-discovery-pattern",
      "circuit-breaker-with-retry",
      "twelve-factor-app"
    ],
    "tags": [
      "sidecar",
      "service-mesh",
      "containers",
      "kubernetes",
      "cross-cutting-concerns"
    ],
    "origin_author": "Microsoft Azure",
    "origin_source": "Microsoft Azure Architecture Center. Sidecar Pattern. docs.microsoft.com/azure/architecture/patterns/sidecar; Burns, B. (2018). Designing Distributed Systems. O'Reilly Media.",
    "origin_source_zh": "Microsoft Azure 架构中心《Sidecar 模式》docs.microsoft.com；Burns（2018）《设计分布式系统》O'Reilly",
    "complexity": "intermediate",
    "when_to_use": [
      "When multiple heterogeneous services written in different languages need the same cross-cutting capabilities (logging, tracing, mTLS) without duplicating code across each service",
      "When you want platform teams to own and upgrade infrastructure concerns (service mesh proxy, secrets agent) independently of application release cycles",
      "When adopting a service mesh (Istio, Linkerd) that requires an Envoy or linkerd-proxy sidecar injected into each pod",
      "When running legacy applications that cannot be modified to add observability or security features but can be wrapped by a sidecar in a container"
    ],
    "when_to_use_zh": [
      "当多个用不同语言编写的异构服务需要相同的横切能力（日志、追踪、mTLS）而不在每个服务中重复代码时",
      "当你希望平台团队独立于应用发布周期拥有和升级基础设施关注点（服务网格代理、密钥代理）时",
      "当采用需要向每个 Pod 注入 Envoy 或 linkerd-proxy sidecar 的服务网格（Istio、Linkerd）时",
      "当运行无法修改以添加可观测性或安全功能但可以在容器中被 sidecar 包装的遗留应用时"
    ],
    "core_concepts": [
      "Network namespace sharing: containers in the same Kubernetes Pod share a network namespace, allowing the sidecar to intercept all inbound and outbound traffic via iptables rules without application-level changes",
      "Transparent proxy injection: service meshes like Istio inject the Envoy sidecar via a mutating admission webhook, automatically installing the proxy for every new Pod without developer action",
      "Ambassador sub-pattern: the sidecar acts as an outbound proxy, handling retries, circuit breaking, and load balancing on behalf of the application — the application makes simple calls to localhost",
      "Adapter sub-pattern: the sidecar transforms the primary container's output (log format, metrics format) into a standard format consumed by the platform — decoupling the app from the platform contract",
      "Init container sequencing: Kubernetes init containers run before the main container and can pre-configure the sidecar or inject configuration secrets before the application starts receiving traffic"
    ],
    "core_concepts_zh": [
      "网络命名空间共享：同一 Kubernetes Pod 中的容器共享网络命名空间，允许 sidecar 通过 iptables 规则拦截所有入站和出站流量，无需应用层变更",
      "透明代理注入：Istio 等服务网格通过 mutating admission webhook 注入 Envoy sidecar，自动为每个新 Pod 安装代理，无需开发者操作",
      "Ambassador 子模式：sidecar 充当出站代理，代表应用处理重试、熔断和负载均衡——应用只对 localhost 进行简单调用",
      "Adapter 子模式：sidecar 将主容器的输出（日志格式、指标格式）转换为平台消费的标准格式——将应用与平台契约解耦",
      "Init 容器排序：Kubernetes init 容器在主容器之前运行，可在应用开始接收流量之前预配置 sidecar 或注入配置密钥"
    ],
    "timeline": [
      [
        "2016",
        "Lyft engineers build and deploy the first version of Envoy as a sidecar proxy for their service mesh, solving polyglot observability and resilience at scale"
      ],
      [
        "2017",
        "Microsoft publishes the Sidecar Pattern in the Azure Architecture Center, formalizing the three sub-patterns: Ambassador, Adapter, and Sidecar"
      ],
      [
        "2017",
        "Google, IBM, and Lyft release Istio 0.1, making automatic sidecar injection via Envoy the default service mesh deployment model for Kubernetes"
      ],
      [
        "2019",
        "Kubernetes introduces native sidecar container support proposals to address startup ordering and graceful shutdown race conditions in production deployments"
      ]
    ],
    "timeline_zh": [
      [
        "2016",
        "Lyft 工程师构建并部署第一版 Envoy 作为其服务网格的 sidecar 代理，解决多语言环境下的可观测性和弹性问题"
      ],
      [
        "2017",
        "Microsoft 在 Azure 架构中心发布 Sidecar 模式，正式化三个子模式：Ambassador、Adapter 和 Sidecar"
      ],
      [
        "2017",
        "Google、IBM 和 Lyft 发布 Istio 0.1，使通过 Envoy 自动注入 sidecar 成为 Kubernetes 的默认服务网格部署模型"
      ],
      [
        "2019",
        "Kubernetes 引入原生 sidecar 容器支持提案，解决生产部署中的启动顺序和优雅关闭竞争条件"
      ]
    ],
    "dos": [
      "Do ensure the sidecar and primary container have independent resource limits so a runaway sidecar cannot starve the application of CPU or memory",
      "Do version and release the sidecar image independently from the application image so platform teams can roll out security patches without coordinating application releases",
      "Do handle sidecar startup order explicitly using init containers or readiness gates to prevent the primary service from accepting traffic before the proxy is ready",
      "Do test sidecar failure modes independently: verify that the primary container behaves gracefully when the sidecar crashes, restarts, or is temporarily unavailable"
    ],
    "dos_zh": [
      "确保 sidecar 和主容器有独立的资源限制，使失控的 sidecar 无法耗尽应用的 CPU 或内存",
      "独立于应用镜像版本控制和发布 sidecar 镜像，使平台团队可以在不协调应用发布的情况下推出安全补丁",
      "使用 init 容器或就绪门明确处理 sidecar 启动顺序，防止主服务在代理就绪前接受流量",
      "独立测试 sidecar 故障模式：验证当 sidecar 崩溃、重启或临时不可用时主容器的优雅行为"
    ],
    "donts": [
      "Don't put business logic in the sidecar — it should contain only infrastructure concerns; mixing application logic into the sidecar defeats the separation of concerns that makes the pattern valuable",
      "Don't ignore the CPU and memory overhead of the sidecar — each Envoy proxy in a large service mesh consumes 50-100 MB and measurable CPU, which multiplies across thousands of pods",
      "Don't couple the sidecar version lifecycle to the application lifecycle — if upgrades are coordinated, you lose the ability to patch infrastructure independently",
      "Don't use the sidecar pattern for single-service deployments without a clear growth plan — the added complexity is only justified when the cross-cutting concern spans multiple services"
    ],
    "donts_zh": [
      "不要在 sidecar 中放置业务逻辑——它应只包含基础设施关注点；将应用逻辑混入 sidecar 会破坏使该模式有价值的关注点分离",
      "不要忽视 sidecar 的 CPU 和内存开销——大型服务网格中每个 Envoy 代理消耗 50-100 MB 和可测量的 CPU，这在数千个 Pod 中累积",
      "不要将 sidecar 版本生命周期与应用生命周期耦合——如果升级需要协调，你就失去了独立修补基础设施的能力",
      "不要在没有明确增长计划的单服务部署中使用 sidecar 模式——只有当横切关注点跨越多个服务时，增加的复杂性才合理"
    ],
    "case_study_company": "Airbnb",
    "case_study": "Airbnb migrated their service-to-service communication to an Envoy-based sidecar mesh called SmartStack 2.0 (Synapse + Nerve replaced by Envoy). Prior to the migration, each service team had to maintain custom retry logic, timeouts, and circuit breakers in application code. After injection of Envoy sidecars across their Kubernetes fleet, these policies were centralized in xDS configuration managed by the platform team. During a 2019 incident, the platform team was able to adjust timeout policies across 200 services simultaneously by pushing a single xDS config update — a change that would have required 200 separate deployments under the previous approach.",
    "case_study_zh": "Airbnb 将其服务间通信迁移到名为 SmartStack 2.0 的基于 Envoy 的 sidecar 网格（Synapse + Nerve 被 Envoy 取代）。迁移前，每个服务团队必须在应用代码中维护自定义的重试逻辑、超时和熔断器。在其 Kubernetes 集群中注入 Envoy sidecar 后，这些策略集中在由平台团队管理的 xDS 配置中。在 2019 年的一次事故中，平台团队能够通过推送单个 xDS 配置更新同时调整 200 个服务的超时策略——在之前的方式下，这需要 200 次独立部署。",
    "when_not_to_use": [
      "Single-process monoliths where all services share a process and cross-cutting concerns are better addressed by middleware or aspect-oriented programming",
      "Serverless functions where the execution model is event-driven and ephemeral, making a long-running sidecar process impractical and cost-inefficient",
      "Resource-constrained edge or IoT deployments where the memory and CPU overhead of running an additional proxy container is prohibitive"
    ],
    "when_not_to_use_zh": [
      "所有服务共享进程的单进程单体应用，横切关注点通过中间件或面向切面编程更好地处理",
      "执行模型是事件驱动和短暂的无服务器函数，运行长期 sidecar 进程不切实际且成本效率低下",
      "内存和 CPU 开销使运行额外代理容器不可行的资源受限边缘或 IoT 部署"
    ],
    "adopters": [
      "Airbnb (Envoy sidecar mesh)",
      "Lyft (Envoy originator)",
      "Google (Istio + Envoy)",
      "Microsoft (Dapr sidecar runtime)",
      "Uber (M3 metrics via sidecar)"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "observability"
    ],
    "maturity_ring": "established",
    "primary_source": "Microsoft Azure Architecture Center. \"Sidecar Pattern\". docs.microsoft.com/azure/architecture/patterns/sidecar.",
    "secondary_sources": [
      "Burns, B. (2018). \"Designing Distributed Systems\". O'Reilly Media.",
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 13. O'Reilly Media.",
      "Calcote, L. & Butcher, Z. (2019). \"Istio: Up and Running\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "sidecar-pattern",
        "type": "alternative"
      },
      {
        "slug": "service-discovery-pattern",
        "type": "complement"
      },
      {
        "slug": "circuit-breaker-with-retry",
        "type": "complement"
      }
    ]
  },
  {
    "id": 329,
    "name": "Publish-Subscribe Pattern",
    "name_zh": "发布-订阅模式",
    "slug": "publish-subscribe-pattern",
    "category": "distributed",
    "desc": "Decoupled messaging pattern where publishers emit events to named topics and subscribers receive only the messages matching their subscriptions, eliminating direct coupling between producers and consumers",
    "desc_zh": "解耦的消息传递模式，发布者将事件发送到命名主题，订阅者仅接收与其订阅匹配的消息，消除生产者与消费者之间的直接耦合",
    "steps": [
      "Define topics (or channels/subjects): identify the discrete event types or data streams in your system and give each a named topic; keep topics semantically narrow so subscribers can subscribe to exactly the events they care about",
      "Implement publishers: decouple business logic from transport by having producers publish a message envelope (event type, payload, timestamp, correlation ID) to the broker topic without knowing which, if any, subscribers exist",
      "Configure the message broker: choose a broker that matches your delivery guarantees (Kafka for ordered, durable, at-least-once; RabbitMQ/fanout for transient fire-and-forget; Redis Pub/Sub for in-memory low-latency); set retention, replication, and partition counts appropriate to your throughput",
      "Register subscribers: each consumer declares its subscription to one or more topics, specifying a consumer group if competing consumers share load, or independent subscriptions if each consumer needs a full copy of every message",
      "Handle delivery semantics: implement idempotent message processing in each subscriber so that at-least-once delivery does not corrupt state; use message deduplication keys or consumer offset tracking for exactly-once processing where required"
    ],
    "steps_zh": [
      "定义主题（或通道/subject）：识别系统中的离散事件类型或数据流，并为每个主题命名；保持主题语义上的狭窄性，以便订阅者可以精确订阅他们关心的事件",
      "实现发布者：通过让生产者向代理主题发布消息信封（事件类型、载荷、时间戳、关联 ID）来将业务逻辑与传输层解耦，无需知道是否存在订阅者",
      "配置消息代理：选择符合你的投递保证的代理（Kafka 用于有序、持久、至少一次；RabbitMQ/fanout 用于瞬时即发即弃；Redis Pub/Sub 用于内存低延迟）；设置适合你吞吐量的保留期、副本数和分区数",
      "注册订阅者：每个消费者声明其对一个或多个主题的订阅，若竞争消费者共享负载则指定消费者组，若每个消费者需要每条消息的完整副本则使用独立订阅",
      "处理投递语义：在每个订阅者中实现幂等消息处理，使至少一次投递不会破坏状态；在需要精确一次处理时，使用消息去重键或消费者偏移量跟踪"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Publisher",
      "Topic",
      "Broker",
      "Subscriber",
      "Delivery Semantics"
    ],
    "viz_labels_zh": [
      "发布者",
      "主题",
      "消息代理",
      "订阅者",
      "投递语义"
    ],
    "related": [
      "outbox-pattern",
      "backpressure-pattern",
      "cqrs-pattern",
      "gossip-protocol"
    ],
    "tags": [
      "messaging",
      "event-driven",
      "decoupling",
      "asynchronous",
      "scalability"
    ],
    "origin_author": "CORBA Event Service specification (OMG, 1994); popularized through JMS (Java Message Service, 1998) and academic distributed systems literature of the 1980s–1990s",
    "origin_source": "Birman, K. & Joseph, T. (1987). \"Reliable Communication in the Presence of Failures\". ACM TOCS 5(1); OMG (1994). \"CORBAservices: Common Object Services Specification\"; Eugster, P. et al. (2003). \"The Many Faces of Publish/Subscribe\". ACM Computing Surveys 35(2).",
    "origin_source_zh": "Birman 与 Joseph（1987）《存在故障时的可靠通信》，ACM TOCS 5(1)；OMG（1994）《CORBA 服务：公共对象服务规范》；Eugster 等（2003）《发布/订阅的多种形式》，ACM Computing Surveys 35(2)",
    "complexity": "intermediate",
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "when_to_use": [
      "When multiple independent services need to react to the same domain events without creating tight coupling between producer and consumer codebases",
      "When event fan-out is required — a single business event (OrderPlaced) must trigger actions in billing, inventory, notifications, and analytics simultaneously",
      "When producers and consumers have different scaling characteristics and must be independently deployed and scaled",
      "When building event-driven architectures that require audit trails, event replay, or temporal decoupling where consumers can process messages at their own pace",
      "When integrating heterogeneous systems (legacy and modern) that cannot share a direct API contract but can agree on a message schema"
    ],
    "when_to_use_zh": [
      "当多个独立服务需要响应相同领域事件，但不希望在生产者和消费者代码库之间产生紧耦合时",
      "当需要事件扇出时——单个业务事件（OrderPlaced）必须同时触发账单、库存、通知和分析中的操作",
      "当生产者和消费者有不同的扩缩容特性，必须独立部署和扩缩容时",
      "当构建需要审计追踪、事件重播或时间解耦的事件驱动架构，消费者可以按自己的节奏处理消息时",
      "当集成无法共享直接 API 合约但能就消息模式达成一致的异构系统（遗留系统和现代系统）时"
    ],
    "core_concepts": [
      "Topic (channel/subject): a named logical stream to which publishers write and from which subscribers read; topics act as the coordination point without either side knowing about the other",
      "Broker: the intermediary infrastructure (Kafka, RabbitMQ, SNS, Redis) that receives published messages, stores or routes them, and delivers them to matching subscribers",
      "Consumer group: a set of subscriber instances that collectively consume a topic's messages, each message delivered to exactly one member of the group — enabling horizontal scaling of message processing",
      "Delivery guarantees: at-most-once (fire-and-forget), at-least-once (acknowledge after processing), and exactly-once (idempotent processing or transactional consumers) — each involves different trade-offs in latency, complexity, and correctness",
      "Message schema and versioning: the contract between publisher and subscriber encoded as a message schema (JSON Schema, Avro, Protobuf); schema evolution must be handled carefully to avoid breaking existing consumers"
    ],
    "core_concepts_zh": [
      "主题（通道/subject）：发布者写入、订阅者读取的命名逻辑流；主题充当协调点，双方均不了解对方",
      "代理：接收发布消息、存储或路由消息并将其投递给匹配订阅者的中间基础设施（Kafka、RabbitMQ、SNS、Redis）",
      "消费者组：集体消费主题消息的一组订阅者实例，每条消息恰好投递给组内的一个成员——实现消息处理的水平扩展",
      "投递保证：至多一次（即发即弃）、至少一次（处理后确认）和精确一次（幂等处理或事务型消费者）——每种都涉及延迟、复杂性和正确性之间的不同权衡",
      "消息模式与版本控制：编码为消息模式（JSON Schema、Avro、Protobuf）的发布者与订阅者之间的契约；模式演进必须谨慎处理以避免破坏现有消费者"
    ],
    "timeline": [
      [
        "1987",
        "Birman and Joseph publish research on reliable event notification in distributed systems at Cornell, laying theoretical groundwork for pub/sub messaging"
      ],
      [
        "1994",
        "OMG publishes the CORBA Event Service specification, the first widely-adopted formalization of the publish-subscribe pattern in distributed object systems"
      ],
      [
        "1998",
        "Sun Microsystems releases the Java Message Service (JMS) 1.0 specification, standardizing pub/sub and point-to-point messaging APIs for Java enterprise applications"
      ],
      [
        "2011",
        "LinkedIn open-sources Apache Kafka, a distributed commit log reimagining pub/sub with durable ordered partitions, enabling both real-time streaming and historical event replay at internet scale"
      ]
    ],
    "timeline_zh": [
      [
        "1987",
        "Birman 和 Joseph 在康奈尔大学发表分布式系统中可靠事件通知的研究，为发布-订阅消息传递奠定理论基础"
      ],
      [
        "1994",
        "OMG 发布 CORBA 事件服务规范，这是分布式对象系统中发布-订阅模式的首个被广泛采用的正式定义"
      ],
      [
        "1998",
        "Sun Microsystems 发布 Java 消息服务（JMS）1.0 规范，为 Java 企业应用标准化发布-订阅和点对点消息传递 API"
      ],
      [
        "2011",
        "LinkedIn 开源 Apache Kafka，这是一个以持久有序分区重新构想发布-订阅的分布式提交日志，在互联网规模上实现实时流处理和历史事件重播"
      ]
    ],
    "dos": [
      "Do design topics around business events rather than technical operations — name topics after domain facts (order.placed, payment.processed) so consumers understand the business intent without reading publisher code",
      "Do implement idempotent consumers: assign each message a unique ID and check for duplicates before processing, because most durable pub/sub brokers deliver at-least-once and re-delivery is inevitable during failures or consumer restarts",
      "Do version your message schemas explicitly and adopt a compatibility strategy (backward, forward, or full) using a schema registry (Confluent Schema Registry, AWS Glue) to prevent silent data corruption as schemas evolve",
      "Do set retention policies and dead-letter topics: configure appropriate message TTL and a dead-letter queue to capture unprocessable messages so they can be inspected and replayed without blocking healthy message flow"
    ],
    "dos_zh": [
      "围绕业务事件而非技术操作设计主题——以领域事实命名主题（order.placed、payment.processed），使消费者无需阅读发布者代码即可理解业务意图",
      "实现幂等消费者：为每条消息分配唯一 ID 并在处理前检查重复，因为大多数持久化的发布-订阅代理采用至少一次投递，在故障或消费者重启期间重新投递是不可避免的",
      "明确为消息模式添加版本并采用兼容性策略（向后、向前或完全兼容），使用模式注册表（Confluent Schema Registry、AWS Glue）防止模式演进时发生无声数据损坏",
      "设置保留策略和死信主题：配置适当的消息 TTL 和死信队列以捕获无法处理的消息，使其可以在不阻塞正常消息流的情况下被检查和重播"
    ],
    "donts": [
      "Don't use pub/sub for request-reply interactions that require a synchronous response — the pattern is fundamentally asynchronous; use gRPC or REST for low-latency request/response and reserve pub/sub for fire-and-forget event notification",
      "Don't put large payloads directly in messages: message brokers are optimized for routing metadata, not bulk data transfer; store large objects in S3 or a database and include only a reference (claim-check pattern) in the message",
      "Don't create a single giant topic for all events — coarse-grained topics force consumers to receive and filter messages they don't need, wasting bandwidth and increasing processing cost; design topics to match consumer subscription granularity",
      "Don't ignore message ordering requirements: partitioned brokers like Kafka guarantee order only within a partition; if consumers require global ordering across all events, a single partition eliminates the scalability benefit of the pattern"
    ],
    "donts_zh": [
      "不要将发布-订阅用于需要同步响应的请求-回复交互——该模式本质上是异步的；对低延迟请求/响应使用 gRPC 或 REST，将发布-订阅保留用于即发即弃的事件通知",
      "不要将大负载直接放在消息中：消息代理针对路由元数据而非批量数据传输进行了优化；将大对象存储在 S3 或数据库中，在消息中仅包含引用（声明检查模式）",
      "不要为所有事件创建单一的巨型主题——粗粒度主题迫使消费者接收和过滤不需要的消息，浪费带宽并增加处理成本；设计主题以匹配消费者订阅粒度",
      "不要忽略消息顺序要求：Kafka 等分区代理仅在分区内保证顺序；如果消费者需要跨所有事件的全局顺序，单分区会消除该模式的可扩展性优势"
    ],
    "case_study_company": "LinkedIn",
    "case_study": "LinkedIn built Apache Kafka in 2010–2011 to solve a pub/sub problem at scale: each day, over 1 billion events (page views, searches, ad clicks, profile updates) needed to flow from dozens of producer services to separate consumer systems including Hadoop batch jobs, real-time monitoring, and the social graph update pipeline. The existing ActiveMQ-based system could not sustain the throughput without overwhelming brokers. Kafka's design — append-only partitioned log with consumer-controlled offsets — allowed each consumer group to maintain its own read position independently, enabling the news feed processor, the search indexer, and the analytics pipeline to all consume the same activity stream topics at different speeds and independently replay messages during failures. LinkedIn reported processing over 7 trillion messages per day on Kafka by 2019, a figure that grew to hundreds of trillions per day across the industry as Kafka became the de-facto standard for high-throughput pub/sub.",
    "case_study_zh": "LinkedIn 在 2010–2011 年构建了 Apache Kafka 以解决规模化的发布-订阅问题：每天超过 10 亿个事件（页面浏览、搜索、广告点击、个人资料更新）需要从数十个生产者服务流向独立的消费者系统，包括 Hadoop 批处理作业、实时监控和社交图谱更新管道。现有的基于 ActiveMQ 的系统无法在不压垮代理的情况下维持吞吐量。Kafka 的设计——带有消费者控制偏移量的仅追加分区日志——使每个消费者组能够独立维护自己的读取位置，使新闻推送处理器、搜索索引器和分析管道能够以不同速度消费相同的活动流主题，并在故障期间独立重播消息。LinkedIn 报告称 2019 年在 Kafka 上每天处理超过 7 万亿条消息，随着 Kafka 成为高吞吐量发布-订阅的事实标准，这一数字在整个行业增长到每天数百万亿条。",
    "when_not_to_use": [
      "Simple two-service integrations with no fan-out need: if only one consumer ever reads a producer's output, the broker overhead adds latency and operational complexity with no decoupling benefit over a direct API call",
      "Latency-sensitive synchronous workflows where the caller must block for an immediate response — pub/sub introduces queuing latency that makes it unsuitable for user-facing request/response paths under strict SLA requirements",
      "When message ordering across multiple topics is required — pub/sub brokers provide ordering within a partition or topic, not across topics, making it unsuitable for workflows requiring a guaranteed global event sequence",
      "Very small-scale systems where a shared database, a simple task queue, or direct service calls would suffice — pub/sub adds broker infrastructure, schema management, and consumer group coordination that may not be justified below a certain scale"
    ],
    "when_not_to_use_zh": [
      "没有扇出需求的简单双服务集成：如果只有一个消费者读取生产者的输出，代理开销会增加延迟和运营复杂性，相比直接 API 调用没有解耦优势",
      "调用方必须阻塞等待立即响应的延迟敏感同步工作流——发布-订阅引入的排队延迟使其不适合在严格 SLA 要求下的面向用户请求/响应路径",
      "当需要跨多个主题的消息顺序时——发布-订阅代理在分区或主题内提供顺序，而非跨主题，不适合需要保证全局事件序列的工作流",
      "非常小规模的系统，共享数据库、简单任务队列或直接服务调用就足够了——发布-订阅增加了代理基础设施、模式管理和消费者组协调，在一定规模以下可能不合理"
    ],
    "adopters": [
      "Apache Kafka (LinkedIn, Confluent)",
      "RabbitMQ (Pivotal/VMware)",
      "AWS SNS/SQS",
      "Google Cloud Pub/Sub",
      "Redis Pub/Sub"
    ],
    "primary_source": "Eugster, P., Felber, P., Guerraoui, R. & Kermarrec, A-M. (2003). \"The Many Faces of Publish/Subscribe\". ACM Computing Surveys, 35(2), 114–131.",
    "secondary_sources": [
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 11: Stream Processing. O'Reilly Media.",
      "Narkhede, N., Shapira, G. & Palino, T. (2017). \"Kafka: The Definitive Guide\". O'Reilly Media.",
      "Hohpe, G. & Woolf, B. (2003). \"Enterprise Integration Patterns\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "outbox-pattern",
        "type": "complement"
      },
      {
        "slug": "backpressure-pattern",
        "type": "complement"
      },
      {
        "slug": "cqrs-pattern",
        "type": "complement"
      },
      {
        "slug": "saga-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 332,
    "name": "Retry Pattern",
    "name_zh": "重试模式",
    "slug": "retry-pattern",
    "category": "distributed",
    "desc": "Automatically re-attempt a failed operation with configurable delay and limits to handle transient faults",
    "desc_zh": "对失败操作以可配置的延迟和次数限制自动重试，以应对瞬时故障",
    "steps": [
      "Classify the error: determine whether the failure is transient (network blip, throttling, temporary unavailability) or permanent (bad request, not found) — only retry transient errors",
      "Choose a retry strategy: select immediate retry for very short blips, fixed-delay retry for simple backoff, or exponential backoff with jitter to reduce thundering-herd pressure on recovering services",
      "Set a retry budget: define a maximum attempt count and a total timeout ceiling so that retry loops cannot run indefinitely or cascade into overall request timeouts",
      "Add jitter to delays: introduce randomised variance (±30%) into wait intervals so that multiple clients retrying simultaneously do not synchronise and overwhelm a recovering service",
      "Log and surface retry metrics: emit a counter for each retry attempt and an event on final failure so operations teams can distinguish transient noise from systemic degradation"
    ],
    "steps_zh": [
      "对错误分类：判断故障是瞬时性的（网络抖动、限流、临时不可用）还是永久性的（请求错误、资源不存在）——只对瞬时性错误重试",
      "选择重试策略：对极短抖动使用立即重试，对简单退避使用固定延迟重试，或使用带抖动的指数退避以减轻对恢复中服务的惊群压力",
      "设置重试预算：定义最大重试次数和总超时上限，防止重试循环无限运行或级联导致整体请求超时",
      "为延迟添加抖动：在等待间隔中引入随机变化（±30%），防止多个客户端同步重试并压垮正在恢复的服务",
      "记录并暴露重试指标：为每次重试发出计数器、为最终失败发出事件，使运维团队能区分瞬时噪声与系统性降级"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Transient Error",
      "Retry Strategy",
      "Retry Budget",
      "Jitter",
      "Retry Metrics"
    ],
    "viz_labels_zh": [
      "瞬时错误",
      "重试策略",
      "重试预算",
      "抖动",
      "重试指标"
    ],
    "related": [
      "circuit-breaker-pattern",
      "circuit-breaker-with-retry",
      "bulkhead-pattern"
    ],
    "tags": [
      "retry",
      "resilience",
      "fault-tolerance",
      "transient-faults"
    ],
    "origin_author": "Distributed systems best practice; formalized by Microsoft Azure patterns, 2014",
    "origin_source": "Cloud Design Patterns: Prescriptive Architecture Guidance for Cloud Applications (Microsoft patterns & practices, 2014)",
    "origin_source_zh": "《云设计模式：云应用规范架构指南》（微软 patterns & practices，2014）",
    "complexity": "beginner",
    "when_to_use": [
      "Calling remote services over a network where transient failures (timeouts, connection resets, 503s) are expected and recoverable",
      "Integrating with third-party APIs that enforce rate limits and return 429 Too Many Requests responses",
      "Accessing cloud-managed resources (databases, object stores, message brokers) that may experience brief unavailability during maintenance or scaling",
      "Any idempotent operation where repeating the call produces the same outcome and does not create duplicate side-effects"
    ],
    "when_to_use_zh": [
      "通过网络调用远程服务，预期会出现可恢复的瞬时故障（超时、连接重置、503）时",
      "与执行速率限制并返回 429 Too Many Requests 响应的第三方 API 集成时",
      "访问可能在维护或扩展期间短暂不可用的云托管资源（数据库、对象存储、消息代理）时",
      "任何幂等操作，重复调用产生相同结果且不会产生重复副作用时"
    ],
    "core_concepts": [
      "Transient fault detection: distinguishing recoverable errors (timeouts, throttling such as 429, 5xx) from permanent failures (other 4xx, schema errors) that should never be retried",
      "Exponential backoff: doubling the wait interval between attempts so that retries back off progressively and reduce load on a stressed service",
      "Jitter: randomising delay intervals so that many concurrent clients do not synchronise their retries and create a thundering herd",
      "Retry budget: a hard cap on attempt count and elapsed time that guarantees the retry loop terminates and preserves overall system latency",
      "Idempotency: the property of an operation that it can be executed multiple times without changing the result beyond the first successful execution"
    ],
    "core_concepts_zh": [
      "瞬时故障检测：区分可恢复错误（超时、429 等限流、5xx）与永久性故障（其他 4xx、模式错误），永久性故障不应重试",
      "指数退避：在每次尝试之间将等待间隔加倍，使重试逐渐退避并降低对压力服务的负载",
      "抖动：随机化延迟间隔，防止大量并发客户端同步重试引发惊群效应",
      "重试预算：对尝试次数和已用时间设置硬性上限，保证重试循环终止并维持整体系统延迟",
      "幂等性：操作可多次执行而不改变首次成功执行之外结果的特性"
    ],
    "timeline": [
      [
        "1990s",
        "TCP/IP and RPC middleware incorporate automatic retransmission and retry semantics as foundational reliability mechanisms"
      ],
      [
        "2008",
        "Amazon and Google publish internal guidelines on exponential backoff with jitter for AWS and GCP service clients"
      ],
      [
        "2014",
        "Microsoft formalizes the Retry pattern in 'Cloud Design Patterns', giving the practice a canonical name and taxonomy"
      ],
      [
        "2016",
        "Resilience libraries (Polly, Resilience4j, Spring Retry) make configurable retry pipelines a first-class concern in every major language ecosystem"
      ]
    ],
    "timeline_zh": [
      [
        "1990年代",
        "TCP/IP 和 RPC 中间件将自动重传与重试语义作为基础可靠性机制纳入"
      ],
      [
        "2008",
        "亚马逊和谷歌发布内部指南，为 AWS 和 GCP 服务客户端规定带抖动的指数退避"
      ],
      [
        "2014",
        "微软在《云设计模式》中正式化重试模式，为该实践赋予规范名称和分类体系"
      ],
      [
        "2016",
        "弹性库（Polly、Resilience4j、Spring Retry）使可配置重试管道成为各主流语言生态系统的一等关注点"
      ]
    ],
    "dos": [
      "Do retry only on transient, idempotent operations because retrying non-idempotent calls risks duplicate side-effects such as duplicate charges or duplicate messages",
      "Do use exponential backoff with jitter because it reduces thundering-herd load on recovering services far better than fixed-interval retry",
      "Do cap total retry duration as well as attempt count because unbounded retries can cause cascading timeouts across dependent services",
      "Do pair retry with a circuit breaker so that after a threshold of consecutive failures the circuit opens and stops retrying until the downstream recovers"
    ],
    "dos_zh": [
      "仅对瞬时性、幂等操作重试，因为重试非幂等调用有产生重复副作用（如重复扣款或重复消息）的风险",
      "使用带抖动的指数退避，因为它比固定间隔重试更能降低恢复中服务的惊群压力",
      "同时限制重试总时长和尝试次数，因为无限制重试可能导致跨依赖服务的级联超时",
      "将重试与熔断器配合使用，在连续失败达到阈值后打开熔断，停止重试直至下游恢复"
    ],
    "donts": [
      "Don't retry non-idempotent writes (e.g., POST create-order) without first verifying idempotency keys because duplicate processing causes data corruption",
      "Don't use fixed-interval retry without jitter in high-concurrency systems because all clients will synchronise retries and amplify load on an already-struggling service",
      "Don't set an unlimited retry count because it masks systemic failures and starves the caller of resources while the downstream remains unavailable",
      "Don't retry on 4xx client errors (bad request, unauthorised, not found) because these indicate a logic problem that more retries cannot fix"
    ],
    "donts_zh": [
      "不要在未验证幂等键的情况下重试非幂等写入（如 POST 创建订单），因为重复处理会导致数据损坏",
      "不要在高并发系统中使用无抖动的固定间隔重试，因为所有客户端会同步重试，对已经压力较大的服务雪上加霜",
      "不要设置无限重试次数，因为这会掩盖系统性故障，并在下游持续不可用期间耗尽调用方资源",
      "不要对 4xx 客户端错误（错误请求、未授权、未找到）进行重试，因为这些表示更多重试无法修复的逻辑问题"
    ],
    "case_study_company": "AWS SDK",
    "case_study": "Every AWS SDK implements the Retry pattern with exponential backoff and jitter as its default error-handling strategy. When a DynamoDB call returns a ProvisionedThroughputExceededException, the SDK automatically retries up to three times using decorrelated jitter backoff. This behaviour is documented in the AWS architecture blog post 'Exponential Backoff And Jitter' (2015), which showed that decorrelated jitter reduced completion time by 28% compared to a naive fixed-interval retry under contention. The SDK's configurable RetryMode (legacy, standard, adaptive) lets teams tune the retry budget to match their application's latency tolerance, demonstrating how a well-implemented retry policy is transparent to application code yet dramatically improves resilience.",
    "case_study_zh": "每个 AWS SDK 都将带指数退避和抖动的重试模式作为默认错误处理策略。当 DynamoDB 调用返回 ProvisionedThroughputExceededException 时，SDK 会自动使用解相关抖动退避最多重试三次。这一行为记录在 AWS 架构博客文章《指数退避与抖动》（2015）中，文章显示在竞争条件下，解相关抖动比朴素固定间隔重试将完成时间缩短了 28%。SDK 可配置的 RetryMode（传统、标准、自适应）让团队能够根据应用的延迟容忍度调整重试预算，展示了一个实现良好的重试策略对应用代码透明，却能显著提升弹性。",
    "when_not_to_use": [
      "Non-idempotent operations that create side-effects on each call (e.g., payment processing, email sending) unless an idempotency key mechanism is in place",
      "Permanent error conditions such as authentication failures, invalid input, or resource-not-found where retrying wastes resources without any chance of recovery",
      "Real-time latency-sensitive paths (e.g., live audio/video streams, HFT order execution) where the added latency of even one retry exceeds acceptable response-time budgets",
      "When the downstream service is clearly overloaded and not recovering — use a circuit breaker to halt all calls rather than continuing to retry and amplifying the overload"
    ],
    "when_not_to_use_zh": [
      "每次调用都会产生副作用的非幂等操作（如支付处理、发送电子邮件），除非已有幂等键机制",
      "永久性错误条件，如身份验证失败、无效输入或资源不存在，重试会浪费资源且没有恢复可能",
      "实时延迟敏感路径（如直播音视频流、高频交易订单执行），即使一次重试的附加延迟也超出可接受响应时间预算",
      "当下游服务明显过载且未恢复时——使用熔断器停止所有调用，而非继续重试放大过载"
    ],
    "adopters": [
      "AWS SDK",
      "Azure SDK",
      "Spring Retry",
      "Polly (.NET)",
      "Resilience4j"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Microsoft patterns & practices team (2014). \"Cloud Design Patterns: Prescriptive Architecture Guidance for Cloud Applications\". Microsoft Press. Retry Pattern chapter.",
    "secondary_sources": [
      "Brooker, M. (2015). \"Exponential Backoff And Jitter\". AWS Architecture Blog. aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/",
      "Nygard, M. (2007). \"Release It! Design and Deploy Production-Ready Software\". Pragmatic Bookshelf.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 8: The Trouble with Distributed Systems. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "circuit-breaker-pattern",
        "type": "complement"
      },
      {
        "slug": "bulkhead-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 131,
    "name": "GraphQL Schema Design",
    "name_zh": "GraphQL 模式设计",
    "slug": "graphql-schema-design",
    "category": "api",
    "desc": "Query language and type system for APIs enabling precise data fetching",
    "desc_zh": "用于API的查询语言与类型系统，支持精确数据获取",
    "steps": [
      "Define the domain model as a strongly-typed schema using SDL (Schema Definition Language) with clear object types, enums, and interfaces",
      "Design queries around client use-cases rather than mirroring database tables, grouping related fields into cohesive types",
      "Implement resolvers for each field, ensuring they follow the single-responsibility principle and delegate to data sources",
      "Add pagination (cursor-based or offset), filtering, and sorting arguments to list fields following the Relay Connection specification",
      "Validate the schema against client requirements, document all types with descriptions, and publish the schema for frontend teams"
    ],
    "steps_zh": [
      "使用SDL（模式定义语言）定义领域模型为强类型模式，包含清晰的对象类型、枚举和接口",
      "围绕客户端用例而非数据库表镜像来设计查询，将相关字段分组为内聚的类型",
      "为每个字段实现解析器，确保遵循单一职责原则并委托给数据源",
      "按照Relay连接规范为列表字段添加分页（游标或偏移）、过滤和排序参数",
      "根据客户端需求验证模式，为所有类型添加描述文档，并向前端团队发布模式"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Schema Types",
      "Client Queries",
      "Resolvers",
      "Pagination",
      "Documentation"
    ],
    "viz_labels_zh": [
      "类型定义",
      "客户端查询",
      "解析器",
      "分页",
      "文档"
    ],
    "related": [
      "bff-pattern",
      "api-gateway-pattern",
      "openapi-specification"
    ],
    "tags": [
      "graphql",
      "schema",
      "type-system",
      "query-language"
    ],
    "origin_author": "Lee Byron, Dan Schafer, Nick Schrock / Facebook, 2015",
    "origin_source": "GraphQL specification (spec.graphql.org); referenced in API Design Patterns (JJ Geewax, 2021, Ch. 1)",
    "origin_source_zh": "GraphQL规范（spec.graphql.org）；参见《API设计模式》（JJ Geewax，2021，第1章）",
    "complexity": "intermediate",
    "when_to_use": [
      "When multiple client types (web, mobile, IoT) need different data shapes from the same API",
      "When frontend teams suffer from over-fetching or under-fetching with REST endpoints",
      "When the API aggregates data from multiple microservices and clients need a unified query interface",
      "When rapid frontend iteration requires schema evolution without coordinating backend deployments"
    ],
    "when_to_use_zh": [
      "当多种客户端类型（Web、移动端、IoT）需要从同一API获取不同形状的数据时",
      "当前端团队因REST端点的过度获取或不足获取而苦恼时",
      "当API聚合多个微服务的数据且客户端需要统一查询接口时",
      "当前端快速迭代需要模式演进而无需协调后端部署时"
    ],
    "core_concepts": [
      "Type System: Every field and argument has a declared type, enabling compile-time validation and auto-generated documentation",
      "Declarative Data Fetching: Clients specify exactly the fields they need in a single request, eliminating over-fetching and under-fetching",
      "Resolver Architecture: Each field in the schema maps to a resolver function that knows how to fetch or compute its value from any data source",
      "Introspection: The schema is self-documenting; clients can query the schema itself to discover available types, fields, and relationships",
      "Schema-First Design: The schema serves as a contract between frontend and backend teams, enabling parallel development"
    ],
    "core_concepts_zh": [
      "类型系统：每个字段和参数都有声明的类型，支持编译时验证和自动生成文档",
      "声明式数据获取：客户端在单次请求中精确指定所需字段，消除过度获取和不足获取",
      "解析器架构：模式中的每个字段映射到一个解析器函数，该函数知道如何从任意数据源获取或计算其值",
      "自省能力：模式自带文档功能，客户端可查询模式本身以发现可用的类型、字段和关系",
      "模式优先设计：模式作为前后端团队之间的契约，支持并行开发"
    ],
    "timeline": [
      [
        "2012",
        "Facebook internally develops GraphQL to solve mobile newsfeed data fetching challenges"
      ],
      [
        "2015",
        "Facebook open-sources GraphQL specification and reference implementation (graphql-js)"
      ],
      [
        "2016",
        "GitHub launches its public GraphQL API (v4), validating GraphQL for large-scale production use"
      ],
      [
        "2018",
        "Apollo Federation introduced, enabling GraphQL composition across microservices"
      ],
      [
        "2021",
        "GraphQL becomes an industry standard; the GraphQL Foundation, hosted by the Linux Foundation, stewards the spec"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Facebook内部开发GraphQL以解决移动端新闻流数据获取挑战"
      ],
      [
        "2015",
        "Facebook开源GraphQL规范和参考实现（graphql-js）"
      ],
      [
        "2016",
        "GitHub推出公共GraphQL API（v4），验证了GraphQL在大规模生产中的可行性"
      ],
      [
        "2018",
        "Apollo Federation发布，支持跨微服务的GraphQL组合"
      ],
      [
        "2021",
        "GraphQL成为行业标准；Linux基金会下的GraphQL基金会负责规范管理"
      ]
    ],
    "dos": [
      "Do design types around business domain concepts rather than database tables because it decouples the API from storage implementation",
      "Do use DataLoader or equivalent batching to solve the N+1 query problem because naive resolvers issue one extra query per parent record, so latency grows with every item in a list",
      "Do add field-level descriptions to the schema because introspection-driven tools depend on them for developer experience",
      "Do implement query complexity analysis and depth limiting because unbounded queries can be weaponized for denial-of-service"
    ],
    "dos_zh": [
      "围绕业务领域概念而非数据库表设计类型，因为这将API与存储实现解耦",
      "使用DataLoader或等效的批处理来解决N+1查询问题，因为朴素的解析器会为每条父记录额外发起一次查询，列表越长性能下降越严重",
      "在模式中为字段添加描述，因为基于自省的工具依赖这些描述来提供开发者体验",
      "实现查询复杂度分析和深度限制，因为无界查询可能被利用进行拒绝服务攻击"
    ],
    "donts": [
      "Don't expose database IDs directly as GraphQL IDs because it leaks implementation details and makes ID migration painful",
      "Don't create deeply nested types without pagination because clients can construct exponentially expensive queries",
      "Don't treat GraphQL mutations as simple REST replacements because mutations should model domain actions, not CRUD operations",
      "Don't skip schema versioning strategy because breaking changes to live schemas will disrupt all connected clients"
    ],
    "donts_zh": [
      "不要将数据库ID直接暴露为GraphQL ID，因为这泄露了实现细节并使ID迁移变得困难",
      "不要创建没有分页的深度嵌套类型，因为客户端可以构造指数级昂贵的查询",
      "不要将GraphQL变更简单地视为REST替代品，因为变更应建模领域行为而非CRUD操作",
      "不要忽略模式版本策略，因为对线上模式的破坏性变更将影响所有已连接的客户端"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub introduced its GraphQL API (v4) alongside REST API v3 in 2016 to address the problem of clients needing dozens of REST calls to render a single page. With GraphQL, mobile and web clients could request exactly the data they needed in a single round-trip. GitHub reported that GraphQL reduced API response payload sizes by up to 90% for some endpoints, and the self-documenting schema eliminated the need for separate API documentation maintenance.",
    "case_study_zh": "GitHub于2016年在REST API v3之外推出了GraphQL API v4，解决了客户端渲染单个页面需要发送数十个REST请求的问题。通过GraphQL，移动端和Web客户端可以在一次往返中精确请求所需数据。GitHub报告称GraphQL将某些端点的API响应负载大小减少了高达90%，且自文档化的模式消除了单独维护API文档的需要。",
    "when_not_to_use": [
      "Simple CRUD APIs with a single client type where REST is sufficient and GraphQL adds unnecessary complexity",
      "File upload or streaming scenarios where GraphQL's request-response model is a poor fit",
      "Public APIs where caching at the HTTP layer (CDN, reverse proxy) is critical, since GraphQL POST requests bypass standard HTTP caching",
      "Teams without frontend-backend collaboration maturity, where schema negotiation becomes a bottleneck"
    ],
    "when_not_to_use_zh": [
      "单一客户端类型的简单CRUD API，REST已经足够且GraphQL增加了不必要的复杂性",
      "文件上传或流式传输场景，GraphQL的请求-响应模型不适合",
      "HTTP层缓存（CDN、反向代理）至关重要的公共API，因为GraphQL POST请求绕过标准HTTP缓存",
      "前后端协作成熟度不足的团队，模式协商会成为瓶颈"
    ],
    "adopters": [
      "GitHub",
      "Shopify",
      "Facebook",
      "Twitter",
      "Airbnb"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "maintainability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "GraphQL Foundation (2015). \"GraphQL Specification\". spec.graphql.org.",
    "secondary_sources": [
      "Byron, L. (2015). \"GraphQL: A Data Query Language\". Engineering at Meta Blog.",
      "Geewax, J.J. (2021). \"API Design Patterns\", Ch. 1. Manning Publications.",
      "Buna, S. (2021). \"GraphQL in Action\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "bff-pattern",
        "type": "complement"
      },
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      },
      {
        "slug": "openapi-specification",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 132,
    "name": "gRPC & Protocol Buffers",
    "name_zh": "gRPC与Protocol Buffers",
    "slug": "grpc-protocol-buffers",
    "category": "api",
    "desc": "High-performance RPC framework using HTTP/2 and binary serialization",
    "desc_zh": "基于HTTP/2和二进制序列化的高性能RPC框架",
    "steps": [
      "Define service contracts and message types in .proto files using Protocol Buffer IDL (Interface Definition Language)",
      "Generate client and server stubs from .proto files using the protoc compiler for your target languages",
      "Implement the server-side service logic by overriding the generated stub methods with business logic",
      "Configure gRPC channels with appropriate load balancing, TLS, and deadline/timeout policies",
      "Deploy and monitor the service using gRPC health checking protocol and observability interceptors for tracing and metrics"
    ],
    "steps_zh": [
      "使用Protocol Buffer IDL（接口定义语言）在.proto文件中定义服务契约和消息类型",
      "使用protoc编译器为目标语言从.proto文件生成客户端和服务端存根",
      "通过在生成的存根方法中实现业务逻辑来完成服务端逻辑",
      "配置gRPC通道，设置适当的负载均衡、TLS以及截止时间/超时策略",
      "使用gRPC健康检查协议和可观测性拦截器（追踪与指标）部署并监控服务"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Proto Contract",
      "Code Generate",
      "Server Logic",
      "Channel Config",
      "Monitor"
    ],
    "viz_labels_zh": [
      "Proto契约",
      "代码生成",
      "服务逻辑",
      "通道配置",
      "监控"
    ],
    "related": [
      "api-gateway-pattern",
      "consumer-driven-contracts",
      "asyncapi"
    ],
    "tags": [
      "grpc",
      "protobuf",
      "rpc",
      "http2",
      "binary-serialization"
    ],
    "origin_author": "Google, 2015 (built on internal Stubby RPC framework)",
    "origin_source": "gRPC: A High-Performance Open-Source Universal RPC Framework (grpc.io); referenced in Building Microservices (Sam Newman, 2nd ed., 2021, Ch. 4)",
    "origin_source_zh": "《gRPC：高性能开源通用RPC框架》（grpc.io）；参见《构建微服务》（Sam Newman，第2版，2021，第4章）",
    "complexity": "intermediate",
    "when_to_use": [
      "When low-latency, high-throughput inter-service communication is critical in a microservices architecture",
      "When services are built in multiple programming languages and need a shared contract with code generation",
      "When bidirectional streaming is required for real-time data flows between client and server",
      "When payload size and bandwidth efficiency matter, such as mobile clients on constrained networks"
    ],
    "when_to_use_zh": [
      "在微服务架构中需要低延迟、高吞吐量的服务间通信时",
      "当服务使用多种编程语言构建，需要带代码生成的共享契约时",
      "当需要双向流式传输用于客户端与服务端之间的实时数据流时",
      "当负载大小和带宽效率很重要时，如受限网络上的移动客户端"
    ],
    "core_concepts": [
      "Protocol Buffers: A language-neutral, platform-neutral binary serialization format that is 3-10x smaller and faster than JSON",
      "HTTP/2 Transport: Multiplexed streams over a single TCP connection with header compression, enabling concurrent RPCs without HTTP-level head-of-line blocking",
      "Code Generation: The protoc compiler generates type-safe client stubs and server interfaces from .proto files, eliminating manual serialization code",
      "Streaming Modes: Four communication patterns — unary, server streaming, client streaming, and bidirectional streaming — cover all real-time use cases",
      "Deadlines and Cancellation: Built-in propagation of timeouts and cancellation signals across the entire call chain prevents resource leaks in distributed systems"
    ],
    "core_concepts_zh": [
      "Protocol Buffers：语言无关、平台无关的二进制序列化格式，比JSON小3-10倍且速度更快",
      "HTTP/2传输：通过单个TCP连接上的多路复用流和头部压缩，实现并发RPC且无HTTP层队头阻塞",
      "代码生成：protoc编译器从.proto文件生成类型安全的客户端存根和服务端接口，消除手动序列化代码",
      "流式模式：四种通信模式——一元、服务端流、客户端流和双向流——覆盖所有实时用例",
      "截止时间与取消：内置的超时和取消信号在整个调用链中传播，防止分布式系统中的资源泄漏"
    ],
    "timeline": [
      [
        "2001",
        "Google internally develops Stubby, the precursor to gRPC, for inter-datacenter communication"
      ],
      [
        "2015",
        "Google open-sources gRPC with support for 10+ programming languages (the 1.0 release follows in 2016)"
      ],
      [
        "2017",
        "gRPC becomes a Cloud Native Computing Foundation (CNCF) incubating project"
      ],
      [
        "2019",
        "gRPC-Web enables browser clients to call gRPC services via an Envoy proxy translation layer"
      ],
      [
        "2023",
        "gRPC adoption surges in AI/ML inference pipelines for low-latency model serving (e.g., TensorFlow Serving, Triton)"
      ]
    ],
    "timeline_zh": [
      [
        "2001",
        "Google内部开发Stubby——gRPC的前身，用于跨数据中心通信"
      ],
      [
        "2015",
        "Google开源gRPC，支持10多种编程语言（1.0正式版于2016年发布）"
      ],
      [
        "2017",
        "gRPC成为云原生计算基金会（CNCF）孵化项目"
      ],
      [
        "2019",
        "gRPC-Web使浏览器客户端能通过Envoy代理转换层调用gRPC服务"
      ],
      [
        "2023",
        "gRPC在AI/ML推理管线中的采用激增，用于低延迟模型服务（如TensorFlow Serving、Triton）"
      ]
    ],
    "dos": [
      "Do use proto3 syntax and follow the Protocol Buffer style guide because consistent naming and conventions simplify cross-team collaboration",
      "Do set explicit deadlines on every RPC call because missing deadlines cause cascading timeouts across the entire service mesh",
      "Do version .proto files carefully using reserved fields and field number preservation because binary compatibility depends on stable field numbers",
      "Do implement gRPC interceptors for cross-cutting concerns (auth, logging, tracing) because they provide a clean middleware pattern"
    ],
    "dos_zh": [
      "使用proto3语法并遵循Protocol Buffer风格指南，因为一致的命名和约定简化了跨团队协作",
      "为每次RPC调用设置明确的截止时间，因为缺失截止时间会在整个服务网格中引发级联超时",
      "使用保留字段和字段号保持来仔细管理.proto文件版本，因为二进制兼容性依赖于稳定的字段号",
      "为横切关注点（认证、日志、追踪）实现gRPC拦截器，因为它们提供了清晰的中间件模式"
    ],
    "donts": [
      "Don't expose gRPC directly to browser clients without a proxy because browser APIs do not expose the HTTP/2 trailers that gRPC requires",
      "Don't use gRPC for public-facing APIs where developer ergonomics and curl-testability matter because JSON/REST is far more accessible",
      "Don't ignore backward compatibility when evolving .proto files because removing or renumbering fields breaks all existing clients",
      "Don't skip implementing health checks because orchestrators like Kubernetes depend on them for readiness and liveness probes"
    ],
    "donts_zh": [
      "不要在没有代理的情况下将gRPC直接暴露给浏览器客户端，因为浏览器API无法访问gRPC所需的HTTP/2 trailers",
      "不要在开发者易用性和curl可测试性很重要的公共API中使用gRPC，因为JSON/REST的可访问性远高于此",
      "不要在演进.proto文件时忽视向后兼容性，因为删除或重编号字段会破坏所有现有客户端",
      "不要跳过健康检查的实现，因为Kubernetes等编排器依赖它们进行就绪和存活探测"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix migrated its inter-service communication from REST/JSON to gRPC in 2019 to reduce latency in its microservices architecture of over 700 services. The binary serialization of Protocol Buffers reduced payload sizes by approximately 60%, and HTTP/2 multiplexing eliminated the connection overhead that had plagued their REST-based service mesh. Netflix reported a 30-40% reduction in inter-service latency and significant CPU savings from avoiding JSON parsing at scale.",
    "case_study_zh": "Netflix于2019年将其超过700个微服务间的通信从REST/JSON迁移到gRPC，以降低延迟。Protocol Buffers的二进制序列化将负载大小减少了约60%，HTTP/2多路复用消除了困扰其REST服务网格的连接开销。Netflix报告服务间延迟降低了30-40%，并因避免大规模JSON解析而显著节省了CPU资源。",
    "when_not_to_use": [
      "Public developer-facing APIs where ease of testing with curl and browser tools is a priority",
      "Simple request-response services where the overhead of proto compilation and code generation is unjustified",
      "Environments where firewall or proxy infrastructure does not support HTTP/2",
      "Teams without polyglot service needs, where REST with OpenAPI provides sufficient contract enforcement"
    ],
    "when_not_to_use_zh": [
      "面向外部开发者的公共API，在这种场景中用curl和浏览器工具的易测试性是优先考虑的",
      "简单的请求-响应服务，proto编译和代码生成的开销不合理",
      "防火墙或代理基础设施不支持HTTP/2的环境",
      "不需要多语言服务的团队，REST加OpenAPI已提供足够的契约保障"
    ],
    "adopters": [
      "Google",
      "Netflix",
      "Square",
      "Cisco",
      "CoreOS"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "performance",
      "scalability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Google (2015). \"gRPC: A High-Performance, Open Source Universal RPC Framework\". grpc.io.",
    "secondary_sources": [
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 4. O'Reilly Media.",
      "Indrasiri, K. & Kuruppu, D. (2020). \"gRPC: Up and Running\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      },
      {
        "slug": "consumer-driven-contracts",
        "type": "complement"
      },
      {
        "slug": "asyncapi",
        "type": "related"
      }
    ]
  },
  {
    "id": 133,
    "name": "API Gateway Pattern",
    "name_zh": "API网关模式",
    "slug": "api-gateway-pattern",
    "category": "api",
    "desc": "Single entry point that routes, aggregates, and secures microservices APIs",
    "desc_zh": "作为微服务API的单一入口点，负责路由、聚合和安全防护",
    "steps": [
      "Identify all client types and their API consumption patterns, then define the gateway's routing table mapping external paths to internal services",
      "Implement cross-cutting concerns at the gateway layer: authentication, rate limiting, request/response transformation, and TLS termination",
      "Configure request routing and load balancing rules, including path-based routing, header-based routing, and service discovery integration",
      "Add response aggregation for composite endpoints that combine data from multiple backend services into a single client response",
      "Set up monitoring, circuit breakers, and fallback responses at the gateway to prevent cascading failures from backend service outages"
    ],
    "steps_zh": [
      "识别所有客户端类型及其API消费模式，然后定义网关的路由表，将外部路径映射到内部服务",
      "在网关层实现横切关注点：认证、限流、请求/响应转换和TLS终止",
      "配置请求路由和负载均衡规则，包括基于路径的路由、基于头部的路由和服务发现集成",
      "为组合端点添加响应聚合，将多个后端服务的数据合并为单个客户端响应",
      "在网关设置监控、熔断器和降级响应，防止后端服务故障引发级联失败"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Routing Table",
      "Cross-Cutting",
      "Load Balance",
      "Aggregation",
      "Circuit Breaker"
    ],
    "viz_labels_zh": [
      "路由表",
      "横切关注点",
      "负载均衡",
      "响应聚合",
      "熔断"
    ],
    "related": [
      "bff-pattern",
      "api-rate-limiting-throttling",
      "graphql-schema-design"
    ],
    "tags": [
      "gateway",
      "routing",
      "aggregation",
      "microservices",
      "security"
    ],
    "origin_author": "Microservices community, formalized by Chris Richardson (~2015)",
    "origin_source": "Building Microservices (Sam Newman, 2nd ed., 2021, Ch. 7); API Design Patterns (JJ Geewax, 2021, Ch. 2)",
    "origin_source_zh": "《构建微服务》（Sam Newman，第2版，2021，第7章）；《API设计模式》（JJ Geewax，2021，第2章）",
    "complexity": "intermediate",
    "when_to_use": [
      "When a microservices architecture needs a unified entry point to decouple clients from internal service topology",
      "When cross-cutting concerns like authentication, logging, and rate limiting should be centralized rather than duplicated in each service",
      "When clients need aggregated responses from multiple backend services in a single API call",
      "When internal services use different protocols (gRPC, AMQP) and clients need a consistent HTTP/REST interface"
    ],
    "when_to_use_zh": [
      "当微服务架构需要统一入口点来解耦客户端与内部服务拓扑时",
      "当认证、日志和限流等横切关注点应集中处理而非在每个服务中重复时",
      "当客户端需要在单次API调用中获取多个后端服务的聚合响应时",
      "当内部服务使用不同协议（gRPC、AMQP）而客户端需要一致的HTTP/REST接口时"
    ],
    "core_concepts": [
      "Reverse Proxy: The gateway acts as a reverse proxy, hiding the internal service topology and providing a stable external API surface",
      "Request Routing: Incoming requests are routed to the appropriate backend service based on URL paths, headers, or query parameters",
      "Cross-Cutting Concerns: Authentication, authorization, rate limiting, logging, and CORS are handled once at the gateway instead of in each service",
      "Response Aggregation: The gateway can fan out a single client request to multiple services and compose their responses into one payload",
      "Protocol Translation: The gateway translates between external protocols (HTTP/REST) and internal protocols (gRPC, WebSocket, message queues)"
    ],
    "core_concepts_zh": [
      "反向代理：网关充当反向代理，隐藏内部服务拓扑并提供稳定的外部API表面",
      "请求路由：根据URL路径、头部或查询参数将传入请求路由到适当的后端服务",
      "横切关注点：认证、授权、限流、日志和CORS在网关统一处理，而非在每个服务中重复",
      "响应聚合：网关可将单个客户端请求扇出到多个服务，并将响应组合为一个负载",
      "协议转换：网关在外部协议（HTTP/REST）和内部协议（gRPC、WebSocket、消息队列）之间进行转换"
    ],
    "timeline": [
      [
        "2013",
        "Netflix Zuul emerges as one of the first dedicated API gateways for microservices"
      ],
      [
        "2015",
        "AWS API Gateway launches as a managed service, popularizing the gateway-as-a-service model"
      ],
      [
        "2017",
        "Kong (built on Nginx/OpenResty) becomes the dominant open-source API gateway"
      ],
      [
        "2019",
        "Envoy-based gateways (Ambassador, Gloo) bring Kubernetes-native API gateway capabilities"
      ],
      [
        "2022",
        "Gateway API specification in Kubernetes standardizes gateway configuration as a portable Kubernetes resource"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "Netflix Zuul作为首批专用微服务API网关之一出现"
      ],
      [
        "2015",
        "AWS API Gateway作为托管服务推出，普及了网关即服务模式"
      ],
      [
        "2017",
        "Kong（基于Nginx/OpenResty构建）成为主流开源API网关"
      ],
      [
        "2019",
        "基于Envoy的网关（Ambassador、Gloo）带来Kubernetes原生API网关能力"
      ],
      [
        "2022",
        "Kubernetes中的Gateway API规范将网关配置标准化为可移植的Kubernetes资源"
      ]
    ],
    "dos": [
      "Do keep the gateway stateless and thin because a fat gateway with business logic becomes a distributed monolith",
      "Do implement circuit breakers at the gateway level because backend service failures should degrade gracefully rather than cascading to all clients",
      "Do use declarative configuration for routing rules because imperative gateway logic is hard to audit, test, and version",
      "Do deploy the gateway with high availability and auto-scaling because it is otherwise a single point of failure for all API traffic"
    ],
    "dos_zh": [
      "保持网关无状态和轻薄，因为包含业务逻辑的臃肿网关会变成分布式单体",
      "在网关层实现熔断器，因为后端服务故障应优雅降级而非级联到所有客户端",
      "使用声明式配置定义路由规则，因为命令式网关逻辑难以审计、测试和版本管理",
      "以高可用和自动扩缩方式部署网关，否则它会成为所有API流量的单点故障"
    ],
    "donts": [
      "Don't put business logic in the gateway because it creates tight coupling and turns the gateway into a distributed monolith",
      "Don't use a single gateway for all client types when they have vastly different needs — consider the BFF pattern instead",
      "Don't skip monitoring gateway latency because the gateway adds a network hop and any performance degradation affects every request",
      "Don't hardcode service URLs in the gateway because it defeats the purpose of service discovery and dynamic routing"
    ],
    "donts_zh": [
      "不要在网关中放置业务逻辑，因为这会产生紧耦合并将网关变为分布式单体",
      "当不同客户端类型有截然不同的需求时不要使用单一网关——考虑BFF模式替代",
      "不要忽略监控网关延迟，因为网关增加了一跳网络延迟，任何性能下降都会影响每个请求",
      "不要在网关中硬编码服务URL，因为这违背了服务发现和动态路由的初衷"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix built Zuul as its API gateway to handle over 50 billion API requests per day across its streaming platform. Zuul provides dynamic routing, monitoring, resiliency, and security for over 1,000 backend microservices. By centralizing authentication, rate limiting, and canary routing at the gateway layer, Netflix reduced per-service boilerplate by thousands of lines of code and gained the ability to perform zero-downtime traffic shifts during deployments.",
    "case_study_zh": "Netflix构建了Zuul作为其API网关，处理流媒体平台每天超过500亿次API请求。Zuul为超过1000个后端微服务提供动态路由、监控、弹性和安全功能。通过在网关层集中认证、限流和金丝雀路由，Netflix减少了每个服务数千行的样板代码，并获得了在部署期间执行零停机流量切换的能力。",
    "when_not_to_use": [
      "Monolithic applications with a single backend where a gateway adds unnecessary latency and operational overhead",
      "Simple service-to-service communication where a service mesh (Istio, Linkerd) handles routing and resilience at the infrastructure layer",
      "Extremely latency-sensitive paths where even a single additional network hop is unacceptable",
      "Small teams with fewer than five services where the operational cost of maintaining a gateway exceeds its benefits"
    ],
    "when_not_to_use_zh": [
      "只有单个后端的单体应用，网关增加了不必要的延迟和运维开销",
      "简单的服务间通信，服务网格（Istio、Linkerd）已在基础设施层处理路由和弹性",
      "对延迟极度敏感的路径，即使多一跳网络延迟也不可接受",
      "服务数少于五个的小团队，维护网关的运维成本超过其收益"
    ],
    "adopters": [
      "Netflix",
      "Amazon",
      "Uber",
      "Alibaba",
      "Stripe"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "scalability",
      "security",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Richardson, C. (2018). \"Microservices Patterns: With Examples in Java\", Ch. 8. Manning Publications.",
    "secondary_sources": [
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 7. O'Reilly Media.",
      "Geewax, J.J. (2021). \"API Design Patterns\", Ch. 2. Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "bff-pattern",
        "type": "complement"
      },
      {
        "slug": "api-rate-limiting-throttling",
        "type": "complement"
      },
      {
        "slug": "graphql-schema-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 134,
    "name": "Backend for Frontend (BFF)",
    "name_zh": "服务于前端的后端（BFF）",
    "slug": "bff-pattern",
    "category": "api",
    "desc": "Dedicated backend services tailored to specific client types",
    "desc_zh": "为特定客户端类型量身定制的专用后端服务",
    "steps": [
      "Identify distinct client types (web SPA, mobile iOS/Android, third-party) and document their unique data and interaction requirements",
      "Create a dedicated BFF service for each client type that acts as an intermediary between the client and downstream microservices",
      "Implement client-specific data aggregation, transformation, and formatting in each BFF to optimize payloads for that client's constraints",
      "Have each client team own and maintain their respective BFF to ensure it evolves with the client's needs without cross-team coordination",
      "Keep BFFs thin: they should orchestrate calls to downstream services and shape responses, but never own business logic or persistent state"
    ],
    "steps_zh": [
      "识别不同的客户端类型（Web SPA、移动端iOS/Android、第三方），记录其独特的数据和交互需求",
      "为每种客户端类型创建专用的BFF服务，作为客户端和下游微服务之间的中间层",
      "在每个BFF中实现客户端特定的数据聚合、转换和格式化，优化该客户端约束条件下的负载",
      "让每个客户端团队拥有并维护其对应的BFF，确保它随客户端需求演进而无需跨团队协调",
      "保持BFF轻薄：它们应编排对下游服务的调用并塑造响应，但不应拥有业务逻辑或持久状态"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Client Types",
      "BFF Service",
      "Data Aggregate",
      "Team Ownership",
      "Thin Orchestrator"
    ],
    "viz_labels_zh": [
      "客户端类型",
      "BFF服务",
      "数据聚合",
      "团队所有权",
      "轻量编排"
    ],
    "related": [
      "api-gateway-pattern",
      "graphql-schema-design",
      "grpc-protocol-buffers"
    ],
    "tags": [
      "bff",
      "frontend",
      "microservices",
      "aggregation",
      "client-specific"
    ],
    "origin_author": "Sam Newman / ThoughtWorks, 2015",
    "origin_source": "Building Microservices (Sam Newman, 1st ed., 2015, Ch. 4; 2nd ed., 2021, Ch. 7)",
    "origin_source_zh": "《构建微服务》（Sam Newman，第1版，2015，第4章；第2版，2021，第7章）",
    "complexity": "intermediate",
    "when_to_use": [
      "When mobile and web clients need fundamentally different API response shapes from the same microservices",
      "When a single general-purpose API gateway has become bloated with client-specific logic",
      "When different client teams want independent release cadences for their API layer",
      "When mobile clients on constrained networks need aggressively optimized payloads compared to web clients"
    ],
    "when_to_use_zh": [
      "当移动端和Web客户端需要从相同微服务获取根本不同的API响应结构时",
      "当单一通用API网关因客户端特定逻辑而变得臃肿时",
      "当不同客户端团队希望其API层有独立的发布节奏时",
      "当受限网络上的移动客户端需要比Web客户端更积极的负载优化时"
    ],
    "core_concepts": [
      "Client Affinity: Each BFF is purpose-built for one client type, so it can optimize data shape, payload size, and protocol for that client's specific needs",
      "Team Ownership: The frontend team that consumes the BFF also owns it, eliminating cross-team API negotiation and enabling faster iteration",
      "Aggregation Layer: BFFs fan out requests to multiple downstream microservices and compose responses tailored to the client's UI structure",
      "Thin Orchestration: BFFs contain presentation logic and data shaping but delegate all business rules to downstream domain services",
      "Independent Deployment: Each BFF can be deployed, scaled, and versioned independently without affecting other client channels"
    ],
    "core_concepts_zh": [
      "客户端亲和性：每个BFF专为一种客户端类型构建，可针对该客户端的特定需求优化数据形状、负载大小和协议",
      "团队所有权：消费BFF的前端团队同时拥有它，消除跨团队API协商并实现更快迭代",
      "聚合层：BFF将请求扇出到多个下游微服务，并组合出适合客户端UI结构的响应",
      "轻量编排：BFF包含展示逻辑和数据塑形，但将所有业务规则委托给下游领域服务",
      "独立部署：每个BFF可以独立部署、扩缩和版本管理，不影响其他客户端渠道"
    ],
    "timeline": [
      [
        "2015",
        "Sam Newman coins the BFF pattern in Building Microservices, drawing on ThoughtWorks project experience"
      ],
      [
        "2016",
        "SoundCloud publishes a detailed case study of their BFF adoption for mobile and web clients"
      ],
      [
        "2018",
        "BFF pattern combined with GraphQL emerges as a popular hybrid architecture at companies like Airbnb"
      ],
      [
        "2020",
        "Next.js API routes and Remix loaders popularize BFF-like patterns in the JavaScript full-stack ecosystem"
      ],
      [
        "2023",
        "BFF pattern adapted for AI applications, with dedicated BFFs serving chat interfaces, voice assistants, and API consumers"
      ]
    ],
    "timeline_zh": [
      [
        "2015",
        "Sam Newman在《构建微服务》中提出BFF模式，源于ThoughtWorks项目经验"
      ],
      [
        "2016",
        "SoundCloud发布其移动端和Web客户端BFF采用的详细案例研究"
      ],
      [
        "2018",
        "BFF模式与GraphQL的结合作为流行的混合架构出现在Airbnb等公司"
      ],
      [
        "2020",
        "Next.js API路由和Remix加载器在JavaScript全栈生态中普及类BFF模式"
      ],
      [
        "2023",
        "BFF模式被改造用于AI应用，专用BFF服务于聊天界面、语音助手和API消费者"
      ]
    ],
    "dos": [
      "Do let the frontend team own the BFF because they best understand the client's data needs and can iterate without backend team dependencies",
      "Do keep BFFs focused on data aggregation and response shaping because business logic belongs in domain services",
      "Do share common concerns (auth, logging) via libraries rather than duplicating them across BFFs because consistency matters for cross-cutting concerns",
      "Do monitor each BFF independently because different client patterns produce different load and error profiles"
    ],
    "dos_zh": [
      "让前端团队拥有BFF，因为他们最了解客户端的数据需求，可以在不依赖后端团队的情况下迭代",
      "保持BFF专注于数据聚合和响应塑形，因为业务逻辑属于领域服务",
      "通过共享库而非跨BFF复制来处理通用关注点（认证、日志），因为横切关注点需要一致性",
      "独立监控每个BFF，因为不同客户端模式产生不同的负载和错误特征"
    ],
    "donts": [
      "Don't let BFFs accumulate business logic because they should remain thin orchestration layers, not become new monoliths",
      "Don't create a BFF for every minor client variation because it leads to an explosion of services that are expensive to maintain",
      "Don't share a BFF across fundamentally different client types because it recreates the one-size-fits-all problem the pattern was designed to solve",
      "Don't skip shared libraries for authentication and observability because each BFF reimplementing these independently introduces security and consistency risks"
    ],
    "donts_zh": [
      "不要让BFF积累业务逻辑，因为它们应保持为轻量编排层，而非演变为新的单体",
      "不要为每个小的客户端差异都创建BFF，因为这会导致服务爆炸式增长，维护成本高昂",
      "不要在根本不同的客户端类型之间共享BFF，因为这会重新产生该模式本要解决的一刀切问题",
      "不要跳过认证和可观测性的共享库，因为每个BFF独立重新实现会引入安全和一致性风险"
    ],
    "case_study_company": "SoundCloud",
    "case_study": "SoundCloud adopted the BFF pattern in 2016 when their single monolithic API could not satisfy the divergent needs of their iOS, Android, and web clients. Each client team built its own BFF that aggregated calls to shared microservices and shaped responses for their specific UI. This allowed the mobile team to ship optimized, smaller payloads while the web team could include richer metadata. Release velocity increased by 40% because client teams no longer blocked on backend API changes.",
    "case_study_zh": "SoundCloud于2016年采用BFF模式，当时其单一的单体API无法满足iOS、Android和Web客户端的不同需求。每个客户端团队构建了自己的BFF，聚合对共享微服务的调用并为其特定UI塑形响应。这使移动团队能发送优化的、更小的负载，而Web团队可以包含更丰富的元数据。发布速度提升了40%，因为客户端团队不再被后端API变更阻塞。",
    "when_not_to_use": [
      "When all clients consume the same data shape and a single API or GraphQL layer already serves them well",
      "When the organization is too small to staff separate teams for each BFF, leading to one team maintaining multiple BFFs",
      "When the overhead of deploying and monitoring additional services outweighs the flexibility gained",
      "When downstream services already provide well-designed, client-friendly APIs that need no aggregation"
    ],
    "when_not_to_use_zh": [
      "当所有客户端消费相同的数据结构，单一API或GraphQL层已能良好服务时",
      "当组织规模太小无法为每个BFF配备独立团队，导致一个团队维护多个BFF时",
      "当部署和监控额外服务的开销超过获得的灵活性时",
      "当下游服务已提供设计良好的、对客户端友好的API且不需要聚合时"
    ],
    "adopters": [
      "SoundCloud",
      "Airbnb",
      "Spotify",
      "Netflix",
      "REA Group"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Newman, S. (2015). \"Building Microservices\", 1st ed., Ch. 4. O'Reilly Media.",
    "secondary_sources": [
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 7. O'Reilly Media.",
      "Newman, S. (2015). \"Pattern: Backends For Frontends\". samnewman.io."
    ],
    "typed_relations": [
      {
        "slug": "api-gateway-pattern",
        "type": "extends"
      },
      {
        "slug": "graphql-schema-design",
        "type": "complement"
      },
      {
        "slug": "grpc-protocol-buffers",
        "type": "complement"
      }
    ]
  },
  {
    "id": 135,
    "name": "Consumer-Driven Contracts",
    "name_zh": "消费者驱动契约",
    "slug": "consumer-driven-contracts",
    "category": "api",
    "desc": "API testing approach where consumers define the expectations providers must satisfy",
    "desc_zh": "消费者定义期望、提供者必须满足的API测试方法",
    "steps": [
      "Each consumer team writes contract tests that specify the exact requests they send and the response shape they expect from the provider",
      "Contracts are published to a shared broker (e.g., Pact Broker) where providers can discover all consumer expectations",
      "The provider team runs all published consumer contracts against their implementation to verify compatibility",
      "When contract verification fails, the provider team and affected consumer team collaborate to resolve the incompatibility before deployment",
      "Integrate contract verification into CI/CD pipelines so that breaking changes are caught before they reach production"
    ],
    "steps_zh": [
      "每个消费者团队编写契约测试，指定其发送的确切请求和期望从提供者获得的响应结构",
      "契约发布到共享代理（如Pact Broker），提供者可在此发现所有消费者期望",
      "提供者团队针对其实现运行所有已发布的消费者契约以验证兼容性",
      "当契约验证失败时，提供者团队与受影响的消费者团队协作，在部署前解决不兼容问题",
      "将契约验证集成到CI/CD流水线中，确保在到达生产环境之前捕获破坏性变更"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Consumer Contract",
      "Contract Broker",
      "Provider Verify",
      "Resolve Conflict",
      "CI Gate"
    ],
    "viz_labels_zh": [
      "消费者契约",
      "契约代理",
      "提供者验证",
      "冲突解决",
      "CI门控"
    ],
    "related": [
      "openapi-specification",
      "api-gateway-pattern",
      "grpc-protocol-buffers"
    ],
    "tags": [
      "contracts",
      "testing",
      "consumer-driven",
      "pact",
      "compatibility"
    ],
    "origin_author": "Ian Robinson, 2006; tooling popularized by Pact (2013, DiUS/Beth Skurrie)",
    "origin_source": "Consumer-Driven Contracts: A Service Evolution Pattern (Ian Robinson, 2006); Building Microservices (Sam Newman, 2nd ed., 2021, Ch. 6)",
    "origin_source_zh": "《消费者驱动契约：服务演进模式》（Ian Robinson，2006）；《构建微服务》（Sam Newman，第2版，2021，第6章）",
    "complexity": "intermediate",
    "when_to_use": [
      "When multiple independent teams build services that depend on each other's APIs and need confidence in compatibility",
      "When integration tests are too slow, brittle, or expensive to run against live downstream services",
      "When deploying microservices independently requires assurance that API changes won't break existing consumers",
      "When evolving a shared API that serves many consumers, each with different usage patterns"
    ],
    "when_to_use_zh": [
      "当多个独立团队构建相互依赖API的服务，需要对兼容性有信心时",
      "当集成测试对活跃下游服务运行太慢、太脆弱或成本太高时",
      "当独立部署微服务需要确保API变更不会破坏现有消费者时",
      "当演进一个服务多个消费者的共享API，且每个消费者有不同使用模式时"
    ],
    "core_concepts": [
      "Consumer Contracts: Each consumer defines a lightweight test that captures exactly what it needs from the provider — no more, no less",
      "Provider Verification: The provider runs all consumer contracts as tests against its own codebase, ensuring it satisfies every consumer's expectations",
      "Contract Broker: A shared registry (e.g., Pact Broker) stores and versions contracts, enabling providers to discover consumer expectations without direct communication",
      "Can-I-Deploy: An automated check in CI/CD that verifies whether a specific version of a service is compatible with all its consumers before deployment",
      "Postel's Law in Practice: Providers are liberal in what they accept and conservative in what they produce, guided by actual consumer usage rather than assumed usage"
    ],
    "core_concepts_zh": [
      "消费者契约：每个消费者定义一个轻量测试，精确捕获其从提供者所需的内容——不多不少",
      "提供者验证：提供者针对自身代码库运行所有消费者契约作为测试，确保满足每个消费者的期望",
      "契约代理：共享注册中心（如Pact Broker）存储和版本化契约，使提供者无需直接沟通即可发现消费者期望",
      "可否部署检查：CI/CD中的自动化检查，在部署前验证服务的特定版本是否与所有消费者兼容",
      "Postel定律实践：提供者在接受时宽松、在产出时严格，以实际消费者使用而非假设使用为指导"
    ],
    "timeline": [
      [
        "2006",
        "Ian Robinson publishes the seminal paper 'Consumer-Driven Contracts: A Service Evolution Pattern'"
      ],
      [
        "2013",
        "Pact framework created at DiUS (Australia), becoming the de facto tool for consumer-driven contract testing"
      ],
      [
        "2016",
        "Pact Broker released, enabling centralized contract storage and cross-service compatibility verification"
      ],
      [
        "2018",
        "Spring Cloud Contract provides JVM-native consumer-driven contract testing for Spring Boot microservices"
      ],
      [
        "2021",
        "PactFlow (commercial SaaS) and Pact v4 specification unify HTTP, messaging, and plugin-based contract types"
      ]
    ],
    "timeline_zh": [
      [
        "2006",
        "Ian Robinson发表开创性论文「消费者驱动契约：服务演进模式」"
      ],
      [
        "2013",
        "Pact框架在DiUS（澳大利亚）创建，成为消费者驱动契约测试的事实标准工具"
      ],
      [
        "2016",
        "Pact Broker发布，支持集中化契约存储和跨服务兼容性验证"
      ],
      [
        "2018",
        "Spring Cloud Contract为Spring Boot微服务提供JVM原生的消费者驱动契约测试"
      ],
      [
        "2021",
        "PactFlow（商业SaaS）和Pact v4规范统一了HTTP、消息和基于插件的契约类型"
      ]
    ],
    "dos": [
      "Do write contracts from the consumer's perspective, specifying only what the consumer actually uses, because over-specifying creates false coupling",
      "Do integrate contract verification into both consumer and provider CI pipelines because manual verification is error-prone and unsustainable",
      "Do version contracts and use the can-i-deploy check before every deployment because it prevents incompatible services from reaching production",
      "Do start with the most critical service boundaries because retrofitting contracts everywhere at once is overwhelming"
    ],
    "dos_zh": [
      "从消费者视角编写契约，仅指定消费者实际使用的内容，因为过度指定会产生虚假耦合",
      "将契约验证集成到消费者和提供者的CI流水线中，因为手动验证容易出错且不可持续",
      "版本化契约并在每次部署前使用可否部署检查，因为这能防止不兼容的服务到达生产环境",
      "从最关键的服务边界开始，因为一次性在所有地方改造契约会令人不堪重负"
    ],
    "donts": [
      "Don't let consumers over-specify contracts by asserting on fields they don't use because it creates brittle tests that break on benign provider changes",
      "Don't use consumer-driven contracts as a replacement for end-to-end integration tests because they verify interface compatibility, not business workflow correctness",
      "Don't ignore contract test failures in CI because a green build with failing contracts means broken consumers in production",
      "Don't let contracts become stale by failing to update them when consumer behavior changes because outdated contracts provide false confidence"
    ],
    "donts_zh": [
      "不要让消费者对其不使用的字段进行断言来过度指定契约，因为这会创建脆弱的测试，在提供者进行无害变更时也会失败",
      "不要将消费者驱动契约作为端到端集成测试的替代品，因为它们验证的是接口兼容性而非业务流程正确性",
      "不要忽略CI中的契约测试失败，因为带有失败契约的绿色构建意味着生产中的消费者会出问题",
      "不要因消费者行为变更时未更新契约而让契约过时，因为过时的契约提供虚假的信心"
    ],
    "case_study_company": "Atlassian",
    "case_study": "Atlassian adopted Pact-based consumer-driven contracts across its microservices powering Jira and Confluence. With over 200 services and dozens of teams, integration testing had become a multi-hour bottleneck in their CI pipeline. By shifting to consumer-driven contracts, each service could verify API compatibility in under 2 minutes without standing up dependent services. This reduced their integration test feedback cycle from hours to minutes and eliminated 70% of integration-related production incidents.",
    "case_study_zh": "Atlassian在驱动Jira和Confluence的微服务中全面采用基于Pact的消费者驱动契约。拥有超过200个服务和数十个团队，集成测试已成为CI流水线中数小时的瓶颈。通过转向消费者驱动契约，每个服务可以在不启动依赖服务的情况下在2分钟内验证API兼容性。这将集成测试反馈周期从数小时缩短至分钟，并消除了70%的集成相关生产事故。",
    "when_not_to_use": [
      "Monolithic applications where all components are deployed together and internal API compatibility is guaranteed by the build",
      "Rapidly prototyping a new service where the API surface is changing daily and contracts would need constant rewriting",
      "When there is only one consumer per provider, making direct integration tests simpler and equally effective",
      "Third-party APIs where you have no influence over the provider's implementation and cannot share contracts"
    ],
    "when_not_to_use_zh": [
      "所有组件一起部署的单体应用，构建已保证内部API兼容性",
      "快速原型开发新服务时，API表面每天变化，契约需要不断重写",
      "每个提供者只有一个消费者时，直接集成测试更简单且同样有效",
      "第三方API，你无法影响提供者的实现且无法共享契约"
    ],
    "adopters": [
      "Atlassian",
      "ING Bank",
      "DiUS",
      "Spotify",
      "REA Group"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "testability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Robinson, I. (2006). \"Consumer-Driven Contracts: A Service Evolution Pattern\". martinfowler.com.",
    "secondary_sources": [
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 6. O'Reilly Media.",
      "Skurrie, B. (2013). \"Pact: A Contract Testing Tool\". pact.io."
    ],
    "typed_relations": [
      {
        "slug": "openapi-specification",
        "type": "complement"
      },
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      },
      {
        "slug": "grpc-protocol-buffers",
        "type": "complement"
      }
    ]
  },
  {
    "id": 136,
    "name": "OpenAPI Specification",
    "name_zh": "OpenAPI规范",
    "slug": "openapi-specification",
    "category": "api",
    "desc": "Machine-readable API description standard for RESTful services",
    "desc_zh": "面向RESTful服务的机器可读API描述标准",
    "steps": [
      "Define the OpenAPI document structure: info metadata, server URLs, and global security schemes in YAML or JSON format",
      "Describe each API endpoint as a path object with HTTP methods, parameters (path, query, header), and request bodies",
      "Define reusable schemas in the components section using JSON Schema for request/response models, reducing duplication",
      "Add response definitions for each status code including error responses, linking to component schemas where appropriate",
      "Validate the specification using linting tools (Spectral, openapi-generator), then generate documentation, client SDKs, and server stubs"
    ],
    "steps_zh": [
      "定义OpenAPI文档结构：以YAML或JSON格式描述info元数据、服务器URL和全局安全方案",
      "将每个API端点描述为路径对象，包含HTTP方法、参数（路径、查询、头部）和请求体",
      "在components部分使用JSON Schema定义可复用的请求/响应模型模式，减少重复",
      "为每个状态码添加响应定义（包括错误响应），在适当处链接到组件模式",
      "使用lint工具（Spectral、openapi-generator）验证规范，然后生成文档、客户端SDK和服务端存根"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Info Metadata",
      "Path Object",
      "Schema Components",
      "Response Codes",
      "Generate Lint"
    ],
    "viz_labels_zh": [
      "信息元数据",
      "路径对象",
      "模式组件",
      "响应码",
      "生成验证"
    ],
    "related": [
      "consumer-driven-contracts",
      "asyncapi",
      "hateoas"
    ],
    "tags": [
      "openapi",
      "swagger",
      "specification",
      "documentation",
      "code-generation"
    ],
    "origin_author": "Tony Tam / Swagger (2011); donated to OpenAPI Initiative / Linux Foundation (2015)",
    "origin_source": "OpenAPI Specification (spec.openapis.org); RESTful Web APIs (Leonard Richardson & Mike Amundsen, 2013, Ch. 9)",
    "origin_source_zh": "OpenAPI规范（spec.openapis.org）；《RESTful Web API》（Leonard Richardson & Mike Amundsen，2013，第9章）",
    "complexity": "beginner",
    "when_to_use": [
      "When designing REST APIs that need a single source of truth for documentation, client generation, and testing",
      "When multiple teams consume your API and need unambiguous, machine-readable interface documentation",
      "When enforcing API design standards across an organization using automated linting in CI pipelines",
      "When building developer portals where interactive API exploration (try-it-out) is essential for adoption"
    ],
    "when_to_use_zh": [
      "当设计REST API需要文档、客户端生成和测试的单一事实来源时",
      "当多个团队消费你的API并需要明确的、机器可读的接口文档时",
      "当在组织中通过CI流水线的自动化lint来执行API设计标准时",
      "当构建开发者门户，交互式API探索（试用功能）对采用至关重要时"
    ],
    "core_concepts": [
      "Design-First Approach: Write the OpenAPI specification before implementing the API, using it as a contract between frontend and backend teams",
      "Path and Operation Objects: Each API endpoint is defined by its URL path and HTTP method, with typed parameters, request bodies, and response schemas",
      "Components and Reuse: Shared schemas, security schemes, parameters, and response objects are defined once in the components section and referenced via $ref",
      "Code Generation: Tools like openapi-generator and swagger-codegen produce type-safe client SDKs, server stubs, and mock servers from the specification",
      "Linting and Governance: Rulesets (e.g., Spectral) enforce naming conventions, pagination standards, and error formats across all API specifications in an organization"
    ],
    "core_concepts_zh": [
      "设计优先方法：在实现API之前编写OpenAPI规范，将其作为前后端团队之间的契约",
      "路径和操作对象：每个API端点由URL路径和HTTP方法定义，包含类型化的参数、请求体和响应模式",
      "组件与复用：共享的模式、安全方案、参数和响应对象在components部分定义一次，通过$ref引用",
      "代码生成：openapi-generator和swagger-codegen等工具从规范生成类型安全的客户端SDK、服务端存根和模拟服务器",
      "Lint与治理：规则集（如Spectral）在组织内所有API规范中强制执行命名约定、分页标准和错误格式"
    ],
    "timeline": [
      [
        "2011",
        "Tony Tam creates Swagger specification and tooling for REST API documentation at Wordnik"
      ],
      [
        "2015",
        "Swagger specification donated to the OpenAPI Initiative under the Linux Foundation, renamed to OpenAPI Specification 2.0"
      ],
      [
        "2017",
        "OpenAPI 3.0 released with major improvements: components, links, callbacks, and multiple server support"
      ],
      [
        "2021",
        "OpenAPI 3.1 aligns fully with JSON Schema 2020-12, enabling complete schema compatibility"
      ],
      [
        "2024",
        "OpenAPI 3.1 becomes the dominant API specification format; AI coding assistants use OpenAPI specs for tool/function calling"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Tony Tam在Wordnik创建Swagger规范和工具用于REST API文档"
      ],
      [
        "2015",
        "Swagger规范捐赠给Linux基金会下的OpenAPI倡议，更名为OpenAPI规范2.0"
      ],
      [
        "2017",
        "OpenAPI 3.0发布，带来重大改进：组件、链接、回调和多服务器支持"
      ],
      [
        "2021",
        "OpenAPI 3.1与JSON Schema 2020-12完全对齐，实现完整的模式兼容性"
      ],
      [
        "2024",
        "OpenAPI 3.1成为主流API规范格式；AI编码助手使用OpenAPI规范进行工具/函数调用"
      ]
    ],
    "dos": [
      "Do adopt a design-first workflow where the OpenAPI spec is written and reviewed before any code because it catches API design issues before implementation",
      "Do use $ref extensively for schema reuse because duplicated schemas inevitably drift and cause inconsistencies",
      "Do integrate Spectral or similar linting into CI because it enforces organizational API standards automatically",
      "Do include example values for all request and response schemas because they power interactive documentation and mock servers"
    ],
    "dos_zh": [
      "采用设计优先的工作流，在编写任何代码之前编写并审查OpenAPI规范，因为这能在实现前发现API设计问题",
      "广泛使用$ref进行模式复用，因为重复的模式不可避免地会漂移并导致不一致",
      "将Spectral或类似的lint工具集成到CI中，因为它能自动执行组织的API标准",
      "为所有请求和响应模式包含示例值，因为它们驱动交互式文档和模拟服务器"
    ],
    "donts": [
      "Don't generate the OpenAPI spec from code as the primary workflow because code-first approaches produce specs that mirror implementation rather than design intent",
      "Don't ignore the components section and inline all schemas because it leads to massive duplication and maintenance burden",
      "Don't treat the OpenAPI spec as documentation only because its greatest value is in code generation, testing, and contract enforcement",
      "Don't skip error response schemas because undocumented error formats force consumers to handle errors through guesswork"
    ],
    "donts_zh": [
      "不要将从代码生成OpenAPI规范作为主要工作流，因为代码优先的方法产生的规范反映的是实现而非设计意图",
      "不要忽略components部分并内联所有模式，因为这会导致大量重复和维护负担",
      "不要仅将OpenAPI规范视为文档，因为其最大价值在于代码生成、测试和契约执行",
      "不要跳过错误响应模式，因为未文档化的错误格式迫使消费者通过猜测来处理错误"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe maintains one of the most comprehensive OpenAPI specifications in the industry, covering every endpoint of their payment API. The specification drives automatic generation of client libraries in 7+ languages, interactive API documentation on their developer portal, and internal contract testing. By treating the OpenAPI spec as the single source of truth, Stripe ensures that documentation, SDKs, and the live API never diverge, which has been a key factor in their industry-leading developer experience.",
    "case_study_zh": "Stripe维护着业界最全面的OpenAPI规范之一，覆盖其支付API的每个端点。该规范驱动着7种以上语言的客户端库自动生成、开发者门户上的交互式API文档以及内部契约测试。通过将OpenAPI规范作为单一事实来源，Stripe确保文档、SDK和实时API永不偏离，这是其行业领先的开发者体验的关键因素。",
    "when_not_to_use": [
      "GraphQL APIs where the schema itself serves as the specification and OpenAPI adds no value",
      "Event-driven or asynchronous APIs where AsyncAPI is the appropriate specification standard",
      "Internal gRPC services where Protocol Buffer definitions already serve as the machine-readable contract",
      "Extremely experimental APIs that change multiple times per day, where spec maintenance overhead exceeds its benefits"
    ],
    "when_not_to_use_zh": [
      "GraphQL API，其模式本身就是规范，OpenAPI不增加价值",
      "事件驱动或异步API，AsyncAPI是适当的规范标准",
      "内部gRPC服务，Protocol Buffer定义已作为机器可读契约",
      "每天变更多次的极端实验性API，规范维护开销超过其收益"
    ],
    "adopters": [
      "Stripe",
      "Twilio",
      "GitHub",
      "Microsoft",
      "Google"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "OpenAPI Initiative (2021). \"OpenAPI Specification v3.1.0\". spec.openapis.org.",
    "secondary_sources": [
      "Richardson, L. & Amundsen, M. (2013). \"RESTful Web APIs\", Ch. 9. O'Reilly Media.",
      "Tam, T. (2011). \"Swagger: A Simple, Open Standard for Describing REST APIs\". swagger.io."
    ],
    "typed_relations": [
      {
        "slug": "consumer-driven-contracts",
        "type": "complement"
      },
      {
        "slug": "asyncapi",
        "type": "alternative"
      },
      {
        "slug": "hateoas",
        "type": "complement"
      }
    ]
  },
  {
    "id": 137,
    "name": "Webhook Pattern",
    "name_zh": "Webhook模式",
    "slug": "webhook-pattern",
    "category": "api",
    "desc": "Event-driven API integration via HTTP callbacks for real-time notifications",
    "desc_zh": "通过HTTP回调实现事件驱动的API集成，提供实时通知",
    "steps": [
      "Define the webhook events your system will emit, documenting each event type's payload schema and trigger conditions",
      "Build a subscription management API that allows consumers to register callback URLs, select event types, and configure secret tokens for verification",
      "Implement the event dispatch system that serializes events, signs payloads with HMAC, and delivers HTTP POST requests to registered callback URLs",
      "Add retry logic with exponential backoff for failed deliveries, and implement a dead-letter queue for persistently failing endpoints",
      "Provide consumers with a webhook delivery log and replay capability so they can debug missed events and recover from outages"
    ],
    "steps_zh": [
      "定义系统将发出的webhook事件，记录每种事件类型的负载模式和触发条件",
      "构建订阅管理API，允许消费者注册回调URL、选择事件类型并配置用于验证的密钥令牌",
      "实现事件分发系统，序列化事件、使用HMAC签名负载，并向注册的回调URL发送HTTP POST请求",
      "为失败的投递添加指数退避的重试逻辑，并为持续失败的端点实现死信队列",
      "为消费者提供webhook投递日志和重放功能，以便调试遗漏事件并从故障中恢复"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Event Types",
      "Subscription API",
      "Dispatch System",
      "Retry and DLQ",
      "Delivery Log"
    ],
    "viz_labels_zh": [
      "事件类型",
      "订阅管理",
      "事件分发",
      "重试队列",
      "投递日志"
    ],
    "related": [
      "asyncapi",
      "api-rate-limiting-throttling",
      "api-gateway-pattern"
    ],
    "tags": [
      "webhook",
      "event-driven",
      "callback",
      "integration",
      "push"
    ],
    "origin_author": "Jeff Lindsay, 2007 (coined the term); pattern predates the name in early HTTP callback systems",
    "origin_source": "RESTful Web APIs (Leonard Richardson & Mike Amundsen, 2013, Ch. 11); API Design Patterns (JJ Geewax, 2021, Ch. 15)",
    "origin_source_zh": "《RESTful Web API》（Leonard Richardson & Mike Amundsen，2013，第11章）；《API设计模式》（JJ Geewax，2021，第15章）",
    "complexity": "intermediate",
    "when_to_use": [
      "When consumers need real-time notifications of events without continuously polling your API",
      "When integrating with third-party systems that need to react to state changes in your platform",
      "When building automation workflows where downstream actions should trigger immediately upon events",
      "When reducing API load by pushing updates to interested consumers instead of serving repeated polling requests"
    ],
    "when_to_use_zh": [
      "当消费者需要实时事件通知而不需持续轮询API时",
      "当与需要对平台状态变更做出反应的第三方系统集成时",
      "当构建自动化工作流，下游动作应在事件发生时立即触发时",
      "当通过向感兴趣的消费者推送更新来减少API负载，而非服务重复的轮询请求时"
    ],
    "core_concepts": [
      "Push-Based Delivery: The server proactively sends HTTP POST requests to consumer-registered URLs when events occur, inverting the traditional pull model",
      "Payload Signing: HMAC signatures (using a shared secret) in request headers allow consumers to verify that webhook payloads originated from the legitimate sender",
      "Idempotency: Consumers must handle duplicate deliveries gracefully using event IDs, because retry logic can deliver the same event multiple times",
      "Retry with Backoff: Failed deliveries are retried with exponential backoff to handle transient consumer outages without overwhelming recovering endpoints",
      "Subscription Management: A self-service API for registering, updating, and deactivating webhook subscriptions with event type filtering"
    ],
    "core_concepts_zh": [
      "推送式投递：服务器在事件发生时主动向消费者注册的URL发送HTTP POST请求，逆转了传统的拉取模型",
      "负载签名：请求头中的HMAC签名（使用共享密钥）允许消费者验证webhook负载来自合法发送者",
      "幂等性：消费者必须使用事件ID优雅处理重复投递，因为重试逻辑可能多次投递同一事件",
      "带退避的重试：对失败投递使用指数退避重试，以处理消费者的瞬时故障而不压垮恢复中的端点",
      "订阅管理：自助服务API，用于注册、更新和停用webhook订阅，支持事件类型过滤"
    ],
    "timeline": [
      [
        "2007",
        "Jeff Lindsay coins the term 'webhooks' and advocates for web callbacks as a lightweight integration mechanism"
      ],
      [
        "2012",
        "GitHub, Stripe, and Twilio popularize webhooks as a standard feature of developer platforms"
      ],
      [
        "2018",
        "Standard Webhooks initiative begins, proposing conventions for signatures, retries, and payload formats"
      ],
      [
        "2020",
        "Svix and other webhook-as-a-service platforms emerge, abstracting delivery infrastructure for SaaS providers"
      ],
      [
        "2024",
        "Webhooks become the primary integration mechanism for AI agent platforms, enabling tool-use callbacks and event-driven AI workflows"
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "Jeff Lindsay创造「webhooks」一词，倡导Web回调作为轻量级集成机制"
      ],
      [
        "2012",
        "GitHub、Stripe和Twilio将webhook普及为开发者平台的标准功能"
      ],
      [
        "2018",
        "Standard Webhooks倡议启动，提出签名、重试和负载格式的约定"
      ],
      [
        "2020",
        "Svix等webhook即服务平台出现，为SaaS提供者抽象投递基础设施"
      ],
      [
        "2024",
        "Webhook成为AI代理平台的主要集成机制，支持工具使用回调和事件驱动的AI工作流"
      ]
    ],
    "dos": [
      "Do sign every webhook payload with HMAC and document the verification process because unsigned webhooks are trivially spoofable",
      "Do include a unique event ID in every payload because consumers need it for idempotent processing and deduplication",
      "Do implement exponential backoff with jitter for retries because synchronized retries to a recovering endpoint cause thundering herd problems",
      "Do provide a delivery log and manual replay endpoint because consumers need to debug and recover from missed events"
    ],
    "dos_zh": [
      "使用HMAC为每个webhook负载签名并文档化验证过程，因为未签名的webhook极易被伪造",
      "在每个负载中包含唯一事件ID，因为消费者需要它来进行幂等处理和去重",
      "为重试实现带抖动的指数退避，因为对恢复中端点的同步重试会导致惊群问题",
      "提供投递日志和手动重放端点，因为消费者需要调试和恢复遗漏的事件"
    ],
    "donts": [
      "Don't send webhook payloads without signatures because it exposes consumers to spoofing and injection attacks",
      "Don't retry indefinitely without a maximum retry count and dead-letter mechanism because it wastes resources on permanently unreachable endpoints",
      "Don't include sensitive data (passwords, tokens) in webhook payloads because callback URLs may traverse untrusted networks",
      "Don't assume consumers process events in order because network conditions and retries can deliver events out of sequence"
    ],
    "donts_zh": [
      "不要发送不签名的webhook负载，因为这使消费者暴露于伪造和注入攻击",
      "不要在没有最大重试次数和死信机制的情况下无限重试，因为这会在永久不可达的端点上浪费资源",
      "不要在webhook负载中包含敏感数据（密码、令牌），因为回调URL可能经过不可信的网络",
      "不要假设消费者按顺序处理事件，因为网络条件和重试可能导致事件乱序投递"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe's webhook system delivers billions of events per month to merchants for payment lifecycle events (charge.succeeded, invoice.paid, dispute.created). Each event is signed with HMAC-SHA256 using per-endpoint secrets, and Stripe retries failed deliveries for up to 72 hours with exponential backoff. Their webhook dashboard provides real-time delivery logs with payload inspection and one-click replay. This push-based model eliminated the need for merchants to poll the Stripe API, reducing API call volume by an estimated 80% for event-driven workflows.",
    "case_study_zh": "Stripe的webhook系统每月向商户投递数十亿个支付生命周期事件（charge.succeeded、invoice.paid、dispute.created）。每个事件使用每端点密钥的HMAC-SHA256签名，Stripe对失败投递以指数退避重试长达72小时。其webhook仪表板提供实时投递日志，支持负载检查和一键重放。这种推送模型消除了商户轮询Stripe API的需要，将事件驱动工作流的API调用量减少了约80%。",
    "when_not_to_use": [
      "When consumers cannot expose a publicly accessible HTTP endpoint (e.g., behind strict corporate firewalls)",
      "When strict ordering guarantees are required, as webhook delivery order is inherently non-deterministic",
      "When the event volume is extremely high and consumers cannot absorb the throughput, requiring a message queue instead",
      "When bidirectional communication is needed, as webhooks are unidirectional push-only"
    ],
    "when_not_to_use_zh": [
      "当消费者无法暴露公开可访问的HTTP端点时（如严格的企业防火墙后面）",
      "当需要严格的顺序保证时，因为webhook投递顺序本质上是非确定性的",
      "当事件量极高且消费者无法承受吞吐量时，需要消息队列替代",
      "当需要双向通信时，因为webhook是单向推送"
    ],
    "adopters": [
      "Stripe",
      "GitHub",
      "Twilio",
      "Shopify",
      "Slack"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "scalability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Lindsay, J. (2007). \"Web Hooks to Revolutionize the Web\". progrium.com.",
    "secondary_sources": [
      "Richardson, L. & Amundsen, M. (2013). \"RESTful Web APIs\", Ch. 11. O'Reilly Media.",
      "Geewax, J.J. (2021). \"API Design Patterns\", Ch. 15. Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "asyncapi",
        "type": "complement"
      },
      {
        "slug": "api-rate-limiting-throttling",
        "type": "complement"
      },
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 138,
    "name": "API Rate Limiting & Throttling",
    "name_zh": "API限流与节流",
    "slug": "api-rate-limiting-throttling",
    "category": "api",
    "desc": "Protect APIs from overuse by controlling request rates per client",
    "desc_zh": "通过控制每个客户端的请求速率来保护API免遭过度使用",
    "steps": [
      "Define rate limit policies based on business tiers: identify limits per API key, user, IP, or organization with different quotas for free and paid plans",
      "Choose a rate limiting algorithm appropriate for your use case: token bucket for burst tolerance, sliding window for smooth enforcement, or fixed window for simplicity",
      "Implement the rate limiter using a distributed counter store (Redis, Memcached) that tracks request counts with TTL-based expiration",
      "Return standard HTTP 429 (Too Many Requests) responses with Retry-After, X-RateLimit-Limit, X-RateLimit-Remaining, and X-RateLimit-Reset headers",
      "Monitor rate limit metrics, alert on unusual patterns (abuse, DDoS), and provide a self-service dashboard for consumers to track their usage"
    ],
    "steps_zh": [
      "基于业务层级定义限流策略：按API密钥、用户、IP或组织识别限制，为免费和付费计划设置不同配额",
      "选择适合用例的限流算法：令牌桶用于突发容忍、滑动窗口用于平滑执行、固定窗口用于简单场景",
      "使用分布式计数存储（Redis、Memcached）实现限流器，通过TTL过期来跟踪请求计数",
      "返回标准HTTP 429（请求过多）响应，包含Retry-After、X-RateLimit-Limit、X-RateLimit-Remaining和X-RateLimit-Reset头部",
      "监控限流指标，对异常模式（滥用、DDoS）告警，并为消费者提供自助仪表板以跟踪使用量"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Rate Policy",
      "Algorithm",
      "Distributed Counter",
      "429 Response",
      "Monitor"
    ],
    "viz_labels_zh": [
      "速率策略",
      "算法",
      "分布式计数",
      "429响应",
      "监控"
    ],
    "related": [
      "api-gateway-pattern",
      "webhook-pattern",
      "openapi-specification"
    ],
    "tags": [
      "rate-limiting",
      "throttling",
      "protection",
      "quota",
      "resilience"
    ],
    "origin_author": "Web infrastructure community; formalized in IETF RFC 6585 (2012, HTTP 429 status code)",
    "origin_source": "API Design Patterns (JJ Geewax, 2021, Ch. 17); RESTful Web APIs (Leonard Richardson & Mike Amundsen, 2013, Ch. 11)",
    "origin_source_zh": "《API设计模式》（JJ Geewax，2021，第17章）；《RESTful Web API》（Leonard Richardson & Mike Amundsen，2013，第11章）",
    "complexity": "intermediate",
    "when_to_use": [
      "When a public API must protect backend services from abuse, DDoS attacks, and runaway client scripts",
      "When different client tiers (free, premium, enterprise) need different usage quotas to align with business models",
      "When shared infrastructure serves multiple tenants and fair resource allocation must be enforced",
      "When costly downstream operations (database queries, third-party API calls) need protection from traffic spikes"
    ],
    "when_to_use_zh": [
      "当公共API必须保护后端服务免受滥用、DDoS攻击和失控客户端脚本的影响时",
      "当不同客户端层级（免费、高级、企业）需要不同的使用配额以匹配业务模型时",
      "当共享基础设施服务多个租户且必须执行公平的资源分配时",
      "当昂贵的下游操作（数据库查询、第三方API调用）需要保护以免受流量尖峰影响时"
    ],
    "core_concepts": [
      "Token Bucket Algorithm: Tokens are added at a fixed rate; each request consumes a token. Allows bursts up to bucket capacity while enforcing average rate over time",
      "Sliding Window Log: Tracks timestamps of each request in a rolling time window, providing precise rate calculation but with higher memory usage",
      "Fixed Window Counter: Counts requests in discrete time windows (e.g., per minute). Simple but susceptible to burst-at-boundary edge cases",
      "Distributed Rate Limiting: Rate counters stored in Redis or similar shared stores ensure consistent enforcement across multiple API server instances",
      "Quota Headers: X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, and Retry-After headers communicate rate limit status transparently to consumers"
    ],
    "core_concepts_zh": [
      "令牌桶算法：以固定速率添加令牌，每个请求消耗一个令牌。允许突发到桶容量，同时随时间强制平均速率",
      "滑动窗口日志：在滚动时间窗口中跟踪每个请求的时间戳，提供精确的速率计算但内存使用较高",
      "固定窗口计数器：在离散时间窗口（如每分钟）中计数请求。简单但容易受到边界突发的边缘情况影响",
      "分布式限流：将速率计数器存储在Redis或类似共享存储中，确保跨多个API服务器实例的一致执行",
      "配额头部：X-RateLimit-Limit、X-RateLimit-Remaining、X-RateLimit-Reset和Retry-After头部向消费者透明传达限流状态"
    ],
    "timeline": [
      [
        "2008",
        "Twitter API introduces rate limiting as one of the first major public APIs to enforce per-client quotas"
      ],
      [
        "2012",
        "IETF publishes RFC 6585 standardizing HTTP 429 Too Many Requests status code"
      ],
      [
        "2015",
        "API gateway products (Kong, AWS API Gateway) include built-in rate limiting as a core feature"
      ],
      [
        "2019",
        "IETF draft for RateLimit header fields standardizes X-RateLimit-* headers across implementations"
      ],
      [
        "2023",
        "AI API providers (OpenAI, Anthropic) make sophisticated rate limiting central to usage-based pricing models"
      ]
    ],
    "timeline_zh": [
      [
        "2008",
        "Twitter API引入限流，成为最早执行每客户端配额的主要公共API之一"
      ],
      [
        "2012",
        "IETF发布RFC 6585，标准化HTTP 429请求过多状态码"
      ],
      [
        "2015",
        "API网关产品（Kong、AWS API Gateway）将内置限流作为核心功能"
      ],
      [
        "2019",
        "IETF关于RateLimit头部字段的草案在各实现间标准化X-RateLimit-*头部"
      ],
      [
        "2023",
        "AI API提供商（OpenAI、Anthropic）将精细化限流作为基于使用量定价模型的核心"
      ]
    ],
    "dos": [
      "Do return informative 429 responses with Retry-After and rate limit headers because clients need to implement backoff logic based on concrete values",
      "Do apply rate limits at the API gateway level because it protects all downstream services uniformly without per-service implementation",
      "Do differentiate rate limits by client tier and endpoint because not all APIs have the same cost and not all clients have the same entitlement",
      "Do use sliding window or token bucket algorithms for production because fixed window counters allow boundary-burst abuse"
    ],
    "dos_zh": [
      "返回带有Retry-After和限流头部的信息性429响应，因为客户端需要基于具体值实现退避逻辑",
      "在API网关层应用限流，因为它能统一保护所有下游服务而无需每个服务单独实现",
      "按客户端层级和端点区分限流，因为并非所有API成本相同，也并非所有客户端权限相同",
      "在生产中使用滑动窗口或令牌桶算法，因为固定窗口计数器允许边界突发滥用"
    ],
    "donts": [
      "Don't return generic 500 or 503 errors when rate limiting because clients cannot distinguish server errors from quota exhaustion",
      "Don't implement rate limiting only on individual servers without a shared counter because clients will get inconsistent limits across instances",
      "Don't set rate limits without monitoring and alerting because undetected abuse will degrade service for legitimate users",
      "Don't apply the same rate limit to all endpoints because expensive operations (search, analytics) should have lower limits than lightweight ones (health check)"
    ],
    "donts_zh": [
      "不要在限流时返回通用的500或503错误，因为客户端无法区分服务器错误和配额耗尽",
      "不要仅在单个服务器上实现限流而不使用共享计数器，因为客户端在不同实例间会获得不一致的限制",
      "不要设置限流而不监控和告警，因为未检测到的滥用会降低合法用户的服务质量",
      "不要对所有端点应用相同的限流，因为昂贵操作（搜索、分析）应比轻量操作（健康检查）有更低的限制"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub's REST API enforces a rate limit of 5,000 requests per hour for authenticated users and 60 per hour for unauthenticated requests. Every response includes X-RateLimit-Limit, X-RateLimit-Remaining, and X-RateLimit-Reset headers. For their GraphQL API, GitHub uses a point-based system where each query costs points proportional to its complexity. This tiered approach protects GitHub's infrastructure while allowing legitimate automation tools to operate within generous limits, and the transparent headers enable clients to self-throttle before hitting limits.",
    "case_study_zh": "GitHub的REST API对认证用户执行每小时5,000次请求的限制，对未认证请求为每小时60次。每个响应都包含X-RateLimit-Limit、X-RateLimit-Remaining和X-RateLimit-Reset头部。对于GraphQL API，GitHub使用基于点数的系统，每个查询的成本与其复杂度成正比。这种分层方法保护了GitHub的基础设施，同时允许合法的自动化工具在宽裕的限制内运行，透明的头部使客户端能在触达限制前自行节流。",
    "when_not_to_use": [
      "Internal service-to-service communication within a trusted network where circuit breakers and backpressure are more appropriate",
      "Batch processing pipelines where requests are already controlled by the job scheduler",
      "Real-time streaming APIs (WebSocket, SSE) where connection-based limits are more relevant than request-based limits",
      "Early-stage internal APIs with a handful of known consumers where rate limiting adds unnecessary operational complexity"
    ],
    "when_not_to_use_zh": [
      "可信网络内的内部服务间通信，熔断器和反压机制更合适",
      "批处理管线中请求已由作业调度器控制的场景",
      "实时流式API（WebSocket、SSE），基于连接的限制比基于请求的限制更相关",
      "只有少量已知消费者的早期内部API，限流增加了不必要的运维复杂性"
    ],
    "adopters": [
      "GitHub",
      "Twitter",
      "Stripe",
      "OpenAI",
      "Cloudflare"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "scalability",
      "security"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Geewax, J.J. (2021). \"API Design Patterns\", Ch. 17. Manning Publications.",
    "secondary_sources": [
      "Nottingham, M. & Fielding, R. (2012). \"Additional HTTP Status Codes (429 Too Many Requests)\". IETF RFC 6585.",
      "Richardson, L. & Amundsen, M. (2013). \"RESTful Web APIs\", Ch. 11. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      },
      {
        "slug": "webhook-pattern",
        "type": "complement"
      },
      {
        "slug": "openapi-specification",
        "type": "complement"
      }
    ]
  },
  {
    "id": 139,
    "name": "HATEOAS",
    "name_zh": "超媒体驱动应用状态（HATEOAS）",
    "slug": "hateoas",
    "category": "api",
    "desc": "Hypermedia-driven API navigation where responses contain links to available actions",
    "desc_zh": "超媒体驱动的API导航，响应中包含可用操作的链接",
    "steps": [
      "Design resource representations that include hypermedia links indicating available state transitions and related resources",
      "Choose a hypermedia media type (HAL, JSON:API, Siren, or Collection+JSON) that provides a standard structure for embedding links",
      "Implement link generation in API responses that dynamically reflects the current resource state and the authenticated user's permissions",
      "Build clients that discover available actions by following links in responses rather than hardcoding URL patterns",
      "Document the link relations (rel types) and media types so consumers understand the meaning of each link without needing to know URL structure"
    ],
    "steps_zh": [
      "设计资源表示，包含指示可用状态转换和相关资源的超媒体链接",
      "选择超媒体媒体类型（HAL、JSON:API、Siren或Collection+JSON），提供嵌入链接的标准结构",
      "在API响应中实现链接生成，动态反映当前资源状态和认证用户的权限",
      "构建客户端通过跟随响应中的链接发现可用操作，而非硬编码URL模式",
      "文档化链接关系（rel类型）和媒体类型，使消费者无需了解URL结构即可理解每个链接的含义"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Resource",
      "Media Type",
      "Link Generation",
      "Client Follows",
      "Rel Types"
    ],
    "viz_labels_zh": [
      "资源",
      "媒体类型",
      "链接生成",
      "客户端发现",
      "关系类型"
    ],
    "related": [
      "openapi-specification",
      "graphql-schema-design",
      "consumer-driven-contracts"
    ],
    "tags": [
      "hateoas",
      "hypermedia",
      "rest",
      "links",
      "discoverability"
    ],
    "origin_author": "Roy Fielding, 2000 (as a constraint of REST in his doctoral dissertation)",
    "origin_source": "Architectural Styles and the Design of Network-based Software Architectures (Roy Fielding, 2000, Ch. 5); RESTful Web APIs (Leonard Richardson & Mike Amundsen, 2013, Ch. 4-5)",
    "origin_source_zh": "《架构风格与基于网络的软件架构设计》（Roy Fielding，2000，第5章）；《RESTful Web API》（Leonard Richardson & Mike Amundsen，2013，第4-5章）",
    "complexity": "advanced",
    "when_to_use": [
      "When building long-lived APIs where URL structures may evolve but clients should not break",
      "When API workflows involve complex state machines and clients need guidance on valid next actions",
      "When reducing client-server coupling is a priority, allowing the server to change URL patterns without client updates",
      "When building APIs for exploration and discoverability, where navigating from a single entry point should reveal all capabilities"
    ],
    "when_to_use_zh": [
      "当构建长期存在的API，URL结构可能演进但客户端不应中断时",
      "当API工作流涉及复杂状态机，客户端需要关于有效下一步操作的指导时",
      "当降低客户端-服务器耦合是优先事项，允许服务器更改URL模式而无需客户端更新时",
      "当构建用于探索和发现的API，从单一入口点导航应展示所有功能时"
    ],
    "core_concepts": [
      "Hypermedia as the Engine: API responses embed links that tell clients what actions are available, making the API self-describing and navigable",
      "Link Relations (rel): Standardized or custom relation types (e.g., 'self', 'next', 'payment') give semantic meaning to each link independent of its URL",
      "Dynamic Link Availability: Links in responses change based on resource state and user permissions — a canceled order shows no 'refund' link",
      "Media Types: Hypermedia formats like HAL (application/hal+json), JSON:API, and Siren define how links are structured within JSON responses",
      "Maturity Level 3: In Richardson's REST Maturity Model, HATEOAS represents the highest level of REST compliance, beyond resources and HTTP verbs"
    ],
    "core_concepts_zh": [
      "超媒体作为引擎：API响应嵌入链接，告知客户端可用的操作，使API自描述且可导航",
      "链接关系（rel）：标准化或自定义的关系类型（如「self」「next」「payment」）为每个链接赋予独立于URL的语义含义",
      "动态链接可用性：响应中的链接根据资源状态和用户权限变化——已取消的订单不显示「退款」链接",
      "媒体类型：HAL（application/hal+json）、JSON:API和Siren等超媒体格式定义了JSON响应中链接的结构方式",
      "成熟度第3级：在Richardson的REST成熟度模型中，HATEOAS代表REST合规的最高级别，超越资源和HTTP动词"
    ],
    "timeline": [
      [
        "2000",
        "Roy Fielding defines HATEOAS as a core constraint of REST in his doctoral dissertation at UC Irvine"
      ],
      [
        "2008",
        "Leonard Richardson proposes the REST Maturity Model with HATEOAS as Level 3 (the highest level)"
      ],
      [
        "2012",
        "HAL (Hypertext Application Language) specification by Mike Kelly provides a practical JSON hypermedia format"
      ],
      [
        "2015",
        "JSON:API specification reaches 1.0, offering an opinionated standard for REST APIs with relationship links"
      ],
      [
        "2024",
        "HATEOAS principles influence AI agent tool discovery, where LLMs navigate APIs by following hypermedia links"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "Roy Fielding在其加州大学尔湾分校博士论文中定义HATEOAS为REST的核心约束"
      ],
      [
        "2008",
        "Leonard Richardson提出REST成熟度模型，HATEOAS为第3级（最高级别）"
      ],
      [
        "2012",
        "Mike Kelly的HAL（超文本应用语言）规范提供了实用的JSON超媒体格式"
      ],
      [
        "2015",
        "JSON:API规范达到1.0版本，为带关系链接的REST API提供了一种有主见的标准"
      ],
      [
        "2024",
        "HATEOAS原则影响AI代理工具发现，LLM通过跟随超媒体链接来导航API"
      ]
    ],
    "dos": [
      "Do include a 'self' link in every resource representation because it provides an unambiguous canonical URL for each resource",
      "Do use standardized link relation types from IANA where possible because custom rel types reduce interoperability",
      "Do make link availability conditional on resource state because it guides clients on valid transitions and prevents invalid operations",
      "Do choose an established hypermedia format (HAL, JSON:API, Siren) rather than inventing your own because standard tooling and client libraries already exist"
    ],
    "dos_zh": [
      "在每个资源表示中包含「self」链接，因为它为每个资源提供明确的规范URL",
      "尽可能使用IANA的标准化链接关系类型，因为自定义rel类型降低互操作性",
      "使链接可用性取决于资源状态，因为这引导客户端进行有效转换并防止无效操作",
      "选择已建立的超媒体格式（HAL、JSON:API、Siren）而非发明自己的，因为标准工具和客户端库已经存在"
    ],
    "donts": [
      "Don't include links without documenting their relation types because undocumented links are meaningless to client developers",
      "Don't require clients to construct URLs from templates or documentation because it defeats the purpose of hypermedia-driven discovery",
      "Don't add HATEOAS links as an afterthought to an existing API because retrofitting hypermedia requires rethinking resource design holistically",
      "Don't expect all clients to use hypermedia navigation because many teams will hardcode URLs despite the links being available"
    ],
    "donts_zh": [
      "不要包含链接而不文档化其关系类型，因为未文档化的链接对客户端开发者毫无意义",
      "不要要求客户端从模板或文档构造URL，因为这违背了超媒体驱动发现的目的",
      "不要将HATEOAS链接作为事后补充添加到现有API，因为改造超媒体需要从整体重新思考资源设计",
      "不要期望所有客户端都使用超媒体导航，因为许多团队尽管链接可用仍会硬编码URL"
    ],
    "case_study_company": "PayPal",
    "case_study": "PayPal's REST API is one of the most prominent real-world implementations of HATEOAS. Every API response includes a links array with HATEOAS links that guide consumers through payment workflows. For example, after creating a payment, the response includes 'approve', 'execute', and 'self' links. The client follows the 'approve' link to redirect the user, then the 'execute' link to complete the payment. This approach allows PayPal to change URL structures, add new payment steps, or modify workflows without breaking existing integrations.",
    "case_study_zh": "PayPal的REST API是HATEOAS最著名的实际实现之一。每个API响应都包含一个links数组，其中的HATEOAS链接引导消费者完成支付工作流。例如，创建支付后，响应包含「approve」「execute」和「self」链接。客户端跟随「approve」链接重定向用户，然后跟随「execute」链接完成支付。这种方法允许PayPal更改URL结构、添加新支付步骤或修改工作流而不破坏现有集成。",
    "when_not_to_use": [
      "Internal microservice APIs where clients and servers are deployed together and URL coupling is acceptable",
      "High-performance APIs where the overhead of generating and transmitting links in every response is unacceptable",
      "Simple CRUD APIs with flat resource structures where hypermedia navigation adds no value over documented URL patterns",
      "Mobile or bandwidth-constrained clients where the additional payload size from links degrades performance"
    ],
    "when_not_to_use_zh": [
      "客户端和服务器一起部署的内部微服务API，URL耦合可以接受",
      "高性能API，在每个响应中生成和传输链接的开销不可接受",
      "具有扁平资源结构的简单CRUD API，超媒体导航相比文档化的URL模式不增加价值",
      "移动端或带宽受限的客户端，链接增加的额外负载大小会降低性能"
    ],
    "adopters": [
      "PayPal",
      "Amazon",
      "GitHub",
      "Spring HATEOAS",
      "FoxyCart"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Fielding, R.T. (2000). \"Architectural Styles and the Design of Network-based Software Architectures\", Ch. 5. Doctoral Dissertation, University of California, Irvine.",
    "secondary_sources": [
      "Richardson, L. & Amundsen, M. (2013). \"RESTful Web APIs\", Ch. 4-5. O'Reilly Media.",
      "Fielding, R.T. (2008). \"REST APIs Must Be Hypertext-Driven\". roy.gbiv.com."
    ],
    "typed_relations": [
      {
        "slug": "openapi-specification",
        "type": "complement"
      },
      {
        "slug": "graphql-schema-design",
        "type": "alternative"
      },
      {
        "slug": "consumer-driven-contracts",
        "type": "complement"
      }
    ]
  },
  {
    "id": 140,
    "name": "AsyncAPI",
    "name_zh": "AsyncAPI异步API规范",
    "slug": "asyncapi",
    "category": "api",
    "desc": "Specification standard for describing event-driven and asynchronous APIs",
    "desc_zh": "用于描述事件驱动和异步API的规范标准",
    "steps": [
      "Define the AsyncAPI document with info metadata, server connections (Kafka brokers, MQTT brokers, WebSocket URLs), and protocol bindings",
      "Describe channels (topics/queues) with their publish and subscribe operations, specifying which messages flow in each direction",
      "Define message schemas using JSON Schema or Avro, including headers, payload structure, and correlation IDs for request-reply patterns",
      "Add protocol-specific bindings (Kafka partition keys, AMQP routing keys, MQTT QoS levels) to each channel and operation",
      "Generate documentation, code, and mock servers from the specification using AsyncAPI Generator, and validate the spec with AsyncAPI Studio"
    ],
    "steps_zh": [
      "定义AsyncAPI文档，包含info元数据、服务器连接（Kafka代理、MQTT代理、WebSocket URL）和协议绑定",
      "描述通道（主题/队列）及其发布和订阅操作，指定每个方向的消息流",
      "使用JSON Schema或Avro定义消息模式，包括头部、负载结构和用于请求-回复模式的关联ID",
      "为每个通道和操作添加协议特定绑定（Kafka分区键、AMQP路由键、MQTT QoS级别）",
      "使用AsyncAPI Generator从规范生成文档、代码和模拟服务器，并使用AsyncAPI Studio验证规范"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Servers",
      "Channels",
      "Message Schema",
      "Protocol Bindings",
      "Generate Validate"
    ],
    "viz_labels_zh": [
      "服务器",
      "通道",
      "消息模式",
      "协议绑定",
      "生成验证"
    ],
    "related": [
      "webhook-pattern",
      "openapi-specification",
      "grpc-protocol-buffers"
    ],
    "tags": [
      "asyncapi",
      "event-driven",
      "specification",
      "messaging",
      "async"
    ],
    "origin_author": "Fran Mendez, 2017",
    "origin_source": "AsyncAPI Specification (asyncapi.com); conceptually extends patterns in Building Microservices (Sam Newman, 2nd ed., 2021, Ch. 4)",
    "origin_source_zh": "AsyncAPI规范（asyncapi.com）；概念上扩展了《构建微服务》（Sam Newman，第2版，2021，第4章）中的模式",
    "complexity": "intermediate",
    "when_to_use": [
      "When documenting event-driven architectures that use message brokers (Kafka, RabbitMQ, MQTT, NATS)",
      "When multiple teams produce and consume events and need a shared contract for message schemas",
      "When generating type-safe consumer and producer code from a specification to prevent schema drift",
      "When building developer portals for event-driven APIs that need the same discoverability as REST APIs have with OpenAPI"
    ],
    "when_to_use_zh": [
      "当文档化使用消息代理（Kafka、RabbitMQ、MQTT、NATS）的事件驱动架构时",
      "当多个团队产生和消费事件，需要消息模式的共享契约时",
      "当从规范生成类型安全的消费者和生产者代码以防止模式漂移时",
      "当构建事件驱动API的开发者门户，需要与OpenAPI为REST API提供的相同可发现性时"
    ],
    "core_concepts": [
      "Channels: Named communication paths (Kafka topics, AMQP queues, WebSocket paths) where messages are published and consumed",
      "Operations: Publish and subscribe semantics defined per channel, clarifying whether an application sends or receives messages on each channel",
      "Message Schemas: Strongly-typed message definitions using JSON Schema or Avro, with headers, payload, and optional correlation IDs for tracing",
      "Protocol Bindings: Protocol-specific configurations (Kafka consumer groups, MQTT QoS, AMQP exchanges) that extend the generic specification with implementation details",
      "Code Generation: AsyncAPI Generator produces documentation, client/server code, and mock brokers from the specification, similar to OpenAPI tooling for REST"
    ],
    "core_concepts_zh": [
      "通道：命名的通信路径（Kafka主题、AMQP队列、WebSocket路径），消息在此发布和消费",
      "操作：每个通道定义的发布和订阅语义，明确应用在每个通道上是发送还是接收消息",
      "消息模式：使用JSON Schema或Avro的强类型消息定义，包含头部、负载和用于追踪的可选关联ID",
      "协议绑定：协议特定配置（Kafka消费者组、MQTT QoS、AMQP交换器），用实现细节扩展通用规范",
      "代码生成：AsyncAPI Generator从规范生成文档、客户端/服务端代码和模拟代理，类似于REST的OpenAPI工具"
    ],
    "timeline": [
      [
        "2017",
        "Fran Mendez creates AsyncAPI 1.0 to fill the gap of standardized event-driven API documentation"
      ],
      [
        "2019",
        "AsyncAPI 2.0 released with protocol bindings, multiple server support, and improved message schemas"
      ],
      [
        "2021",
        "AsyncAPI Initiative joins the Linux Foundation, gaining industry-wide governance and backing"
      ],
      [
        "2023",
        "AsyncAPI 3.0 released with major redesign: operations decoupled from channels, reply patterns, and improved reusability"
      ],
      [
        "2025",
        "AsyncAPI adoption grows in AI event pipelines for documenting LLM streaming, agent communication, and model inference events"
      ]
    ],
    "timeline_zh": [
      [
        "2017",
        "Fran Mendez创建AsyncAPI 1.0，填补标准化事件驱动API文档的空白"
      ],
      [
        "2019",
        "AsyncAPI 2.0发布，带来协议绑定、多服务器支持和改进的消息模式"
      ],
      [
        "2021",
        "AsyncAPI倡议加入Linux基金会，获得全行业治理和支持"
      ],
      [
        "2023",
        "AsyncAPI 3.0发布，进行重大重设计：操作与通道解耦、回复模式和改进的可复用性"
      ],
      [
        "2025",
        "AsyncAPI在AI事件管线中的采用增长，用于文档化LLM流式传输、代理通信和模型推理事件"
      ]
    ],
    "dos": [
      "Do adopt a spec-first approach where the AsyncAPI document is written before implementing producers and consumers because it serves as the team contract",
      "Do define protocol bindings explicitly because generic channel descriptions lose critical details like partition strategies and QoS levels",
      "Do version your AsyncAPI specs and use schema registries (Confluent Schema Registry, Apicurio) because event schema evolution needs the same rigor as REST API versioning",
      "Do generate documentation alongside code because event-driven APIs are harder to discover than REST endpoints without proper documentation"
    ],
    "dos_zh": [
      "采用规范优先方法，在实现生产者和消费者之前编写AsyncAPI文档，因为它作为团队契约",
      "明确定义协议绑定，因为通用的通道描述会丢失分区策略和QoS级别等关键细节",
      "版本化AsyncAPI规范并使用模式注册中心（Confluent Schema Registry、Apicurio），因为事件模式演进需要与REST API版本管理相同的严谨性",
      "在代码同时生成文档，因为事件驱动API比REST端点更难发现，需要适当的文档支持"
    ],
    "donts": [
      "Don't use AsyncAPI to describe synchronous request-response APIs because OpenAPI is the appropriate standard for that pattern",
      "Don't skip message schema definitions because untyped event payloads cause silent consumer failures when producers evolve",
      "Don't ignore channel naming conventions because inconsistent topic/queue names create confusion across teams and environments",
      "Don't treat AsyncAPI as documentation only because its greatest value is in code generation and contract testing for event-driven systems"
    ],
    "donts_zh": [
      "不要使用AsyncAPI描述同步请求-响应API，因为OpenAPI是该模式的适当标准",
      "不要跳过消息模式定义，因为无类型的事件负载在生产者演进时会导致消费者静默失败",
      "不要忽略通道命名约定，因为不一致的主题/队列名称会在团队和环境间造成混乱",
      "不要仅将AsyncAPI视为文档，因为其最大价值在于事件驱动系统的代码生成和契约测试"
    ],
    "case_study_company": "Slack",
    "case_study": "Slack adopted AsyncAPI to document its Events API and real-time messaging infrastructure that delivers billions of events daily. Before AsyncAPI, consumer teams relied on wiki pages and tribal knowledge to understand event schemas for workspace events (message.posted, channel.created, reaction.added). By standardizing on AsyncAPI 2.0 with JSON Schema payloads, Slack enabled automatic generation of type-safe event handlers in multiple languages and reduced event integration onboarding time for partner developers from weeks to days.",
    "case_study_zh": "Slack采用AsyncAPI来文档化其每天投递数十亿事件的Events API和实时消息基础设施。在采用AsyncAPI之前，消费者团队依赖wiki页面和口口相传来理解工作区事件（message.posted、channel.created、reaction.added）的事件模式。通过标准化使用AsyncAPI 2.0和JSON Schema负载，Slack实现了多语言类型安全事件处理器的自动生成，并将合作伙伴开发者的事件集成上手时间从数周缩短至数天。",
    "when_not_to_use": [
      "Synchronous REST or GraphQL APIs where OpenAPI or the GraphQL schema already serves as the specification",
      "Simple webhook integrations where a lightweight payload schema is sufficient without full AsyncAPI ceremony",
      "Internal services with a single producer and consumer where direct schema sharing is simpler",
      "Early prototyping phases where event schemas change so rapidly that maintaining the spec creates friction"
    ],
    "when_not_to_use_zh": [
      "同步REST或GraphQL API，OpenAPI或GraphQL模式已作为规范",
      "简单的webhook集成，轻量的负载模式已足够，不需要完整的AsyncAPI仪式",
      "只有单个生产者和消费者的内部服务，直接模式共享更简单",
      "早期原型阶段，事件模式变化太快，维护规范反而产生摩擦"
    ],
    "adopters": [
      "Slack",
      "Adidas",
      "SAP",
      "Salesforce",
      "eBay"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Mendez, F. (2017). \"AsyncAPI Specification\". asyncapi.com.",
    "secondary_sources": [
      "Newman, S. (2021). \"Building Microservices\", 2nd ed., Ch. 4. O'Reilly Media.",
      "Hohpe, G. & Woolf, B. (2003). \"Enterprise Integration Patterns\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "webhook-pattern",
        "type": "complement"
      },
      {
        "slug": "openapi-specification",
        "type": "complement"
      },
      {
        "slug": "grpc-protocol-buffers",
        "type": "alternative"
      }
    ]
  },
  {
    "id": 210,
    "name": "API-First Design",
    "name_zh": "API优先设计",
    "slug": "api-first-design",
    "category": "api",
    "desc": "Design the API contract before implementation using a Swagger-first workflow",
    "desc_zh": "在实现之前先设计API契约，采用Swagger优先工作流",
    "steps": [
      "Define the API contract in OpenAPI (Swagger) or AsyncAPI specification before writing any implementation code, treating the spec as the single source of truth",
      "Review and validate the contract with all stakeholders — frontend, backend, QA, and product — using mock servers generated from the spec",
      "Generate server stubs and client SDKs from the specification so that implementation is constrained to match the agreed contract",
      "Set up contract linting rules (Spectral, Redocly) and integrate spec validation into CI to prevent unauthorized contract drift",
      "Publish the finalized spec to the developer portal and proceed with parallel frontend and backend implementation against the shared contract"
    ],
    "steps_zh": [
      "在编写任何实现代码之前，先用OpenAPI（Swagger）或AsyncAPI规范定义API契约，将规范视为唯一真实来源",
      "使用从规范生成的模拟服务器，与所有利益相关方——前端、后端、QA和产品——评审并验证契约",
      "从规范生成服务端桩代码和客户端SDK，使实现受约束以符合已商定的契约",
      "设置契约代码检查规则（Spectral、Redocly）并将规范验证集成到CI中，防止未经授权的契约偏移",
      "将最终规范发布到开发者门户，并开始基于共享契约并行进行前端和后端实现"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "API Contract",
      "Stakeholder Review",
      "Generate Stubs",
      "Contract Lint",
      "Parallel Impl"
    ],
    "viz_labels_zh": [
      "API契约",
      "评审",
      "生成桩代码",
      "契约检查",
      "并行实现"
    ],
    "related": [
      "openapi-specification",
      "consumer-driven-contracts",
      "bff-pattern"
    ],
    "tags": [
      "api-first",
      "swagger",
      "openapi",
      "contract",
      "design"
    ],
    "origin_author": "Kin Lane (API Evangelist), popularized by Twilio and Stripe engineering blogs, ~2012",
    "origin_source": "API-First Design principles; OpenAPI Initiative (openapis.org); API Design Patterns (JJ Geewax, 2021, Ch. 2)",
    "origin_source_zh": "API优先设计原则；OpenAPI倡议（openapis.org）；「API设计模式」（JJ Geewax，2021，第2章）",
    "complexity": "intermediate",
    "when_to_use": [
      "When multiple teams (frontend, mobile, backend) need to work in parallel and cannot afford sequential dependency",
      "When building a public or partner API where the contract must be stable and well-documented before any consumer integrates",
      "When the organization wants to treat APIs as products with versioned, discoverable contracts rather than implementation byproducts",
      "When integrating with third-party systems that require a formalized API spec for compliance or procurement approval"
    ],
    "when_to_use_zh": [
      "当多个团队（前端、移动端、后端）需要并行工作且无法承受顺序依赖时",
      "当构建公共或合作伙伴API时，契约必须在任何消费者集成之前稳定且有良好文档",
      "当组织希望将API视为具有版本化、可发现契约的产品，而不是实现的副产品时",
      "当与需要正式API规范以满足合规或采购审批要求的第三方系统集成时"
    ],
    "core_concepts": [
      "Contract as Source of Truth: The OpenAPI or AsyncAPI spec is the canonical definition; implementation must conform to the spec, not vice versa",
      "Spec-First Workflow: The spec is authored, reviewed, and agreed upon before a single line of implementation code is written",
      "Mock-Driven Development: Mock servers generated from the spec allow consumer teams to develop and test against a realistic API before the backend is ready",
      "Design-Time Linting: Automated spec validators (Spectral, Redocly) enforce style guides and breaking-change rules as part of the CI pipeline",
      "Parallel Development: Frontend and backend teams can work simultaneously once the contract is frozen, reducing end-to-end delivery time"
    ],
    "core_concepts_zh": [
      "契约即真实来源：OpenAPI或AsyncAPI规范是规范性定义；实现必须符合规范，而不是反过来",
      "规范优先工作流：在编写任何一行实现代码之前，先编写、评审并商定规范",
      "模拟驱动开发：从规范生成的模拟服务器允许消费者团队在后端就绪之前进行开发和测试",
      "设计时代码检查：自动规范验证器（Spectral、Redocly）作为CI流水线的一部分执行风格指南和破坏性变更规则",
      "并行开发：一旦契约冻结，前端和后端团队可以同时工作，减少端到端交付时间"
    ],
    "timeline": [
      [
        "2011",
        "Swagger specification created by Tony Tam at Wordnik, providing the first widely-adopted machine-readable API description format"
      ],
      [
        "2015",
        "OpenAPI Initiative formed under the Linux Foundation; Swagger 2.0 donated and renamed to OpenAPI Specification"
      ],
      [
        "2017",
        "OpenAPI 3.0 released with improved support for callbacks, links, and multiple servers, solidifying API-First tooling"
      ],
      [
        "2019",
        "Stripe and Twilio publicly champion API-First design; Stripe's API changelog becomes a model for contract governance"
      ],
      [
        "2023",
        "AI-assisted API design tools (Postman, Stoplight, Speakeasy) automate spec generation and SDK synthesis from natural language"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Tony Tam在Wordnik创建Swagger规范，提供第一个被广泛采用的机器可读API描述格式"
      ],
      [
        "2015",
        "OpenAPI倡议在Linux基金会下成立；Swagger 2.0被捐赠并更名为OpenAPI规范"
      ],
      [
        "2017",
        "OpenAPI 3.0发布，改进了对回调、链接和多服务器的支持，巩固了API优先工具生态"
      ],
      [
        "2019",
        "Stripe和Twilio公开倡导API优先设计；Stripe的API变更日志成为契约治理的典范"
      ],
      [
        "2023",
        "AI辅助API设计工具（Postman、Stoplight、Speakeasy）自动从自然语言生成规范和SDK"
      ]
    ],
    "dos": [
      "Do treat the spec as a living document under version control so that changes are reviewed, approved, and tracked like code changes",
      "Do generate mock servers automatically from the spec so that consumer teams can integrate and test without waiting for backend completion",
      "Do enforce spec linting in CI with tools like Spectral to catch style violations and breaking changes before they merge",
      "Do involve consumers (frontend, mobile, external partners) in spec review because they surface usability issues before implementation locks them in"
    ],
    "dos_zh": [
      "将规范视为版本控制下的活文档，以便像代码变更一样对变更进行评审、审批和跟踪",
      "从规范自动生成模拟服务器，使消费者团队无需等待后端完成即可集成和测试",
      "在CI中使用Spectral等工具强制执行规范代码检查，在合并前捕获风格违规和破坏性变更",
      "让消费者（前端、移动端、外部合作伙伴）参与规范评审，因为他们能在实现将问题锁定之前发现可用性问题"
    ],
    "donts": [
      "Don't write the spec after the implementation because it becomes a documentation exercise that never accurately reflects the actual API behavior",
      "Don't freeze the spec too early without consumer input because a contract that doesn't reflect actual client needs forces workarounds",
      "Don't skip design-time validation because undocumented edge cases in the spec propagate silently into generated SDKs and client code",
      "Don't treat API-First as a heavyweight process requiring formal approval gates for every minor change — use automated linting instead"
    ],
    "donts_zh": [
      "不要在实现之后编写规范，因为这变成了一个永远无法准确反映实际API行为的文档练习",
      "不要在没有消费者输入的情况下过早冻结规范，因为不反映实际客户端需求的契约会迫使采用变通方法",
      "不要跳过设计时验证，因为规范中未记录的边界情况会悄悄传播到生成的SDK和客户端代码中",
      "不要将API优先视为每次小变更都需要正式审批门控的重量级流程——改用自动化代码检查"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe built its entire developer platform around an API-First philosophy, maintaining a machine-readable OpenAPI specification as the source of truth for all its payment APIs. Every SDK (Python, Ruby, Node.js, Java, Go, .NET) is generated directly from the spec, ensuring that documentation, code samples, and client libraries are always synchronized with the live API. When Stripe introduces a new endpoint or modifies a resource shape, the change flows automatically through the spec into regenerated SDKs and the developer documentation, eliminating the documentation lag that plagues most API providers.",
    "case_study_zh": "Stripe围绕API优先理念构建了整个开发者平台，维护机器可读的OpenAPI规范作为所有支付API的真实来源。每个SDK（Python、Ruby、Node.js、Java、Go、.NET）都直接从规范生成，确保文档、代码示例和客户端库始终与线上API同步。当Stripe引入新端点或修改资源结构时，变更会自动通过规范流入重新生成的SDK和开发者文档，消除了困扰大多数API提供商的文档滞后问题。",
    "when_not_to_use": [
      "Rapid internal prototyping where the API surface changes multiple times per day and maintaining a spec creates more friction than value",
      "Small single-team projects where the frontend and backend developer are the same person and implicit coordination is sufficient",
      "Legacy system wrapping where the API shape is dictated entirely by existing database or service contracts with no design latitude",
      "Exploratory research APIs where the goal is to discover the right API shape through iteration rather than upfront specification"
    ],
    "when_not_to_use_zh": [
      "快速内部原型设计，API接口每天变化多次，维护规范带来的摩擦大于价值",
      "小型单团队项目，前后端开发者是同一个人且隐式协调已经足够",
      "遗留系统包装，API形状完全由现有数据库或服务契约决定，没有设计自由度",
      "探索性研究API，目标是通过迭代而不是预先规范来发现正确的API形状"
    ],
    "adopters": [
      "Stripe",
      "Twilio",
      "Adyen",
      "Shopify",
      "Box"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "usability",
      "maintainability",
      "testability"
    ],
    "maturity_ring": "established",
    "primary_source": "OpenAPI Initiative (2015). \"OpenAPI Specification\". openapis.org.",
    "secondary_sources": [
      "Geewax, J.J. (2021). \"API Design Patterns\", Ch. 2. Manning Publications.",
      "Ponelat, J.S. & Rosenstock, L.L. (2022). \"Designing APIs with Swagger and OpenAPI\". Manning Publications.",
      "Wildermuth, S. (2020). \"API-First development with OpenAPI\". Pluralsight."
    ],
    "typed_relations": [
      {
        "slug": "openapi-specification",
        "type": "complement"
      },
      {
        "slug": "consumer-driven-contracts",
        "type": "complement"
      },
      {
        "slug": "bff-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 211,
    "name": "API Pagination Patterns",
    "name_zh": "API分页模式",
    "slug": "api-pagination-patterns",
    "category": "api",
    "desc": "Cursor-based, offset-based, and keyset pagination strategies for large collection APIs",
    "desc_zh": "针对大型集合API的游标分页、偏移分页和键集分页策略",
    "steps": [
      "Analyze the use case: choose offset pagination for random-access navigation (page numbers, jump to page), cursor-based for real-time feeds, and keyset for high-performance large datasets",
      "For offset pagination, expose page and page_size (or limit/offset) query parameters and return total_count and total_pages in the response envelope",
      "For cursor-based pagination, encode the position as an opaque cursor (base64-encoded pointer), returning next_cursor and prev_cursor in the response so clients never construct cursors manually",
      "For keyset pagination, use the last seen value of a stable indexed column (created_at + id) as the continuation token, avoiding the performance cliff of deep OFFSET queries",
      "Document the pagination model in the OpenAPI spec, include Link headers (RFC 5988) for discoverability, and enforce a maximum page_size limit to prevent abuse"
    ],
    "steps_zh": [
      "分析用例：为随机访问导航（页码、跳转到页面）选择偏移分页，为实时信息流选择游标分页，为高性能大数据集选择键集分页",
      "对于偏移分页，暴露page和page_size（或limit/offset）查询参数，并在响应信封中返回total_count和total_pages",
      "对于游标分页，将位置编码为不透明游标（base64编码的指针），在响应中返回next_cursor和prev_cursor，使客户端无需手动构造游标",
      "对于键集分页，使用稳定索引列（created_at + id）的最后可见值作为延续令牌，避免深层OFFSET查询的性能悬崖",
      "在OpenAPI规范中记录分页模型，包含Link头（RFC 5988）以提高可发现性，并强制执行最大page_size限制以防止滥用"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Offset Pagination",
      "Cursor Pagination",
      "Keyset Pagination",
      "Response Envelope",
      "Spec Doc"
    ],
    "viz_labels_zh": [
      "偏移分页",
      "游标分页",
      "键集分页",
      "响应封装",
      "规范文档"
    ],
    "related": [
      "graphql-schema-design",
      "openapi-specification"
    ],
    "tags": [
      "pagination",
      "cursor",
      "offset",
      "keyset",
      "collections"
    ],
    "origin_author": "Relay Cursor Connections Specification (Facebook/Meta, 2015); keyset pagination popularized by Markus Winand (Use The Index, Luke)",
    "origin_source": "Relay GraphQL Cursor Connections Spec (relay.dev); Winand, M. \"Pagination Done the Right Way\" (use-the-index-luke.com); API Design Patterns (JJ Geewax, 2021, Ch. 21)",
    "origin_source_zh": "Relay GraphQL游标连接规范（relay.dev）；Winand, M.「正确实现分页」（use-the-index-luke.com）；「API设计模式」（JJ Geewax，2021，第21章）",
    "complexity": "intermediate",
    "when_to_use": [
      "When a collection endpoint can return thousands or millions of records and the client needs to retrieve them in batches",
      "When building real-time or near-real-time feeds where new items are inserted at the top and offset pagination causes duplicates or skips",
      "When performance profiling shows that deep OFFSET queries (page 500+) are scanning millions of rows and degrading database performance",
      "When building a public API that serves many diverse clients with different pagination needs (some need page numbers, others need infinite scroll)"
    ],
    "when_to_use_zh": [
      "当集合端点可能返回数千或数百万条记录，客户端需要批量检索时",
      "当构建实时或近实时信息流时，新条目插入顶部，偏移分页会导致重复或跳过",
      "当性能分析显示深层OFFSET查询（第500页以上）正在扫描数百万行并降低数据库性能时",
      "当构建为具有不同分页需求的多种客户端提供服务的公共API时（有些需要页码，有些需要无限滚动）"
    ],
    "core_concepts": [
      "Offset Pagination: Simple limit/offset or page/page_size model; easy to implement and supports random access but degrades at high page numbers due to full table scans",
      "Cursor-Based Pagination: An opaque server-generated pointer encodes the position in the result set; stable across inserts/deletes but does not support random page access",
      "Keyset Pagination: Uses the value of an indexed column as the continuation condition (WHERE created_at < :last_seen); O(log n) performance regardless of depth",
      "Relay Connection Specification: Standardized cursor pagination for GraphQL with edges, nodes, pageInfo, and startCursor/endCursor fields enabling universal client support",
      "Hypermedia Links: RFC 5988 Link headers (rel=next, rel=prev, rel=first, rel=last) allow clients to navigate pages without constructing URLs, supporting HATEOAS principles"
    ],
    "core_concepts_zh": [
      "偏移分页：简单的limit/offset或page/page_size模型；易于实现且支持随机访问，但在高页码时因全表扫描而性能下降",
      "游标分页：不透明的服务器生成指针编码结果集中的位置；在插入/删除操作中稳定，但不支持随机页面访问",
      "键集分页：使用索引列的值作为延续条件（WHERE created_at < :last_seen）；无论深度如何，性能均为O(log n)",
      "Relay连接规范：GraphQL的标准化游标分页，包含edges、nodes、pageInfo以及startCursor/endCursor字段，支持通用客户端",
      "超媒体链接：RFC 5988 Link头（rel=next、rel=prev、rel=first、rel=last）允许客户端在不构造URL的情况下导航页面，支持HATEOAS原则"
    ],
    "timeline": [
      [
        "2005",
        "Offset-based pagination becomes the default pattern for web APIs as SQL LIMIT/OFFSET becomes universally supported"
      ],
      [
        "2012",
        "Twitter introduces cursor-based pagination in its REST API to solve feed consistency problems with offset pagination"
      ],
      [
        "2015",
        "Facebook's Relay framework publishes the Cursor Connection Specification, standardizing cursor pagination for GraphQL"
      ],
      [
        "2016",
        "Markus Winand's 'Pagination Done the Right Way' popularizes keyset (seek method) pagination for high-performance APIs"
      ],
      [
        "2021",
        "GitHub adopts cursor-based pagination across its REST and GraphQL APIs; Stripe uses keyset pagination for its list endpoints"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "基于偏移的分页成为Web API的默认模式，随着SQL LIMIT/OFFSET获得普遍支持"
      ],
      [
        "2012",
        "Twitter在其REST API中引入游标分页，以解决偏移分页的信息流一致性问题"
      ],
      [
        "2015",
        "Facebook的Relay框架发布游标连接规范，为GraphQL标准化游标分页"
      ],
      [
        "2016",
        "Markus Winand的「正确实现分页」推广了键集（寻道方法）分页，用于高性能API"
      ],
      [
        "2021",
        "GitHub在其REST和GraphQL API中采用游标分页；Stripe对其列表端点使用键集分页"
      ]
    ],
    "dos": [
      "Do choose the pagination strategy based on the query pattern: offset for admin UIs with page jumps, cursor for feeds, keyset for large-scale data exports",
      "Do return a consistent envelope with metadata (total_count, has_next_page, next_cursor) so clients can build pagination controls without extra requests",
      "Do enforce a maximum page_size limit (e.g., 100 or 1000) because unbounded page sizes allow clients to accidentally trigger full table scans",
      "Do make cursors opaque (base64-encoded) so that clients treat them as black boxes and you can change their internal format without a breaking change"
    ],
    "dos_zh": [
      "根据查询模式选择分页策略：对有页面跳转的管理UI使用偏移，对信息流使用游标，对大规模数据导出使用键集",
      "返回包含元数据（total_count、has_next_page、next_cursor）的一致信封，使客户端无需额外请求即可构建分页控件",
      "强制执行最大page_size限制（如100或1000），因为无界页面大小允许客户端意外触发全表扫描",
      "使游标不透明（base64编码），使客户端将其视为黑盒，便于在不产生破坏性变更的情况下更改其内部格式"
    ],
    "donts": [
      "Don't use offset pagination for large datasets where clients page deep into the results because the database must scan and discard every skipped row on each request",
      "Don't expose internal database row IDs or timestamps as pagination cursors because it leaks implementation details and makes cursor format migration impossible",
      "Don't return inconsistent total_count values when the dataset is modified concurrently because it causes client-side pagination controls to show incorrect totals",
      "Don't skip pagination on endpoints that return collections because unbounded responses will eventually cause timeouts or memory exhaustion"
    ],
    "donts_zh": [
      "对于客户端会深度翻页的大型数据集，不要使用偏移分页，因为数据库在每次请求时都必须扫描并丢弃所有被跳过的行",
      "不要将内部数据库行ID或时间戳作为分页游标暴露，因为这泄露了实现细节并使游标格式迁移变得不可能",
      "当数据集被并发修改时不要返回不一致的total_count值，因为这会导致客户端分页控件显示不正确的总数",
      "不要跳过返回集合的端点的分页，因为无界响应最终会导致超时或内存耗尽"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe's list APIs use cursor-based pagination with a keyset implementation under the hood. Every list endpoint (charges, customers, invoices) returns a data array, a has_more boolean, and a next page cursor encoded as the ID of the last object in the response. When fetching the next page, clients pass starting_after=<last_id>, and Stripe executes a WHERE id > :last_id query on an indexed primary key, delivering consistent sub-millisecond pagination performance regardless of whether the client is on page 1 or page 10,000. This design has scaled to handle hundreds of millions of objects per account without any degradation.",
    "case_study_zh": "Stripe的列表API使用游标分页，底层采用键集实现。每个列表端点（charge、customer、invoice）返回一个data数组、一个has_more布尔值和编码为响应中最后一个对象ID的下一页游标。获取下一页时，客户端传递starting_after=<last_id>，Stripe在索引主键上执行WHERE id > :last_id查询，无论客户端在第1页还是第10000页，都能提供稳定的亚毫秒级分页性能。这种设计已扩展到处理每个账户数亿个对象而不会产生任何性能下降。",
    "when_not_to_use": [
      "Small collections of fewer than a hundred items where full-list responses are acceptable and pagination adds unnecessary complexity",
      "Search results that require total hit counts and facet aggregations, where specialized search engines (Elasticsearch) handle pagination differently",
      "Streaming endpoints where data is consumed as a continuous flow rather than discrete pages",
      "Internal batch jobs that read entire tables, where streaming cursors or database-native bulk export are more appropriate"
    ],
    "when_not_to_use_zh": [
      "少于一百条的小型集合，全列表响应是可接受的，分页增加了不必要的复杂性",
      "需要总命中数和分面聚合的搜索结果，专业搜索引擎（Elasticsearch）以不同方式处理分页",
      "数据以连续流而非离散页面消费的流式端点",
      "读取整个表的内部批处理作业，流式游标或数据库原生批量导出更为合适"
    ],
    "adopters": [
      "Stripe",
      "GitHub",
      "Twitter",
      "Shopify",
      "Salesforce"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "performance",
      "usability",
      "scalability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Relay Team, Facebook (2015). \"GraphQL Cursor Connections Specification\". relay.dev.",
    "secondary_sources": [
      "Winand, M. (2016). \"Pagination Done the Right Way\". use-the-index-luke.com.",
      "Geewax, J.J. (2021). \"API Design Patterns\", Ch. 21. Manning Publications.",
      "IETF (2010). \"RFC 5988: Web Linking\" (obsoleted by RFC 8288, 2017). tools.ietf.org."
    ],
    "typed_relations": [
      {
        "slug": "graphql-schema-design",
        "type": "complement"
      },
      {
        "slug": "openapi-specification",
        "type": "complement"
      }
    ]
  },
  {
    "id": 212,
    "name": "API Error Handling Standards",
    "name_zh": "API错误处理标准",
    "slug": "api-error-handling-standards",
    "category": "api",
    "desc": "RFC 7807 Problem Details and structured error responses for consistent API error communication",
    "desc_zh": "RFC 7807问题详情与结构化错误响应，用于一致的API错误通信",
    "steps": [
      "Adopt RFC 7807 Problem Details as the standard error response format, returning application/problem+json with type, title, status, detail, and instance fields",
      "Create a canonical error type registry mapping each error condition to a unique URI-based type identifier so that clients can handle errors programmatically",
      "Map HTTP status codes semantically: 400 for malformed requests, 401 for unauthenticated, 403 for forbidden (authenticated but not permitted), 404 for not found, 409 for conflicts, 422 for validation errors, 429 for rate limits, 500 for server faults",
      "Add machine-readable error extensions to the Problem Details body (field-level validation errors, error codes, trace IDs) while keeping the human-readable detail field concise",
      "Document all error responses in the OpenAPI spec with response schemas, test error scenarios in contract tests, and publish an error catalog in the developer documentation"
    ],
    "steps_zh": [
      "采用RFC 7807问题详情作为标准错误响应格式，返回application/problem+json，包含type、title、status、detail和instance字段",
      "创建规范错误类型注册表，将每种错误条件映射到基于URI的唯一类型标识符，使客户端能够以编程方式处理错误",
      "语义化映射HTTP状态码：400用于格式错误请求，401用于未认证，403用于已认证但无权限（禁止访问），404用于未找到，409用于冲突，422用于验证错误，429用于速率限制，500用于服务器故障",
      "向问题详情主体添加机器可读的错误扩展（字段级验证错误、错误代码、追踪ID），同时保持人类可读的detail字段简洁",
      "在OpenAPI规范中记录所有错误响应及响应模式，在契约测试中测试错误场景，并在开发者文档中发布错误目录"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "RFC 7807",
      "Error Registry",
      "HTTP Status Codes",
      "Error Extensions",
      "OpenAPI Spec"
    ],
    "viz_labels_zh": [
      "RFC 7807",
      "错误注册",
      "HTTP状态码",
      "错误扩展",
      "规范文档"
    ],
    "related": [
      "openapi-specification",
      "api-first-design"
    ],
    "tags": [
      "error-handling",
      "rfc-7807",
      "problem-details",
      "http-status",
      "standards"
    ],
    "origin_author": "Mark Nottingham & Erik Wilde, IETF RFC 7807, 2016",
    "origin_source": "IETF RFC 7807: Problem Details for HTTP APIs (Nottingham & Wilde, 2016); API Design Patterns (JJ Geewax, 2021, Ch. 6)",
    "origin_source_zh": "IETF RFC 7807：HTTP API的问题详情（Nottingham & Wilde，2016）；「API设计模式」（JJ Geewax，2021，第6章）",
    "complexity": "beginner",
    "when_to_use": [
      "When building any HTTP API that needs to communicate errors to clients in a consistent, machine-parseable format",
      "When multiple API services need a unified error format so that client error-handling code can be shared across services",
      "When API consumers include non-technical integrators who need clear, actionable error messages to resolve integration problems",
      "When the API is public or partner-facing and error contract stability is as important as the happy-path contract"
    ],
    "when_to_use_zh": [
      "当构建任何需要以一致、机器可解析格式向客户端传达错误的HTTP API时",
      "当多个API服务需要统一错误格式，使客户端错误处理代码可以跨服务共享时",
      "当API消费者包括需要清晰、可操作错误消息来解决集成问题的非技术集成者时",
      "当API是公共的或面向合作伙伴的，错误契约的稳定性与正常路径契约同等重要时"
    ],
    "core_concepts": [
      "RFC 7807 Problem Details: Standardized JSON body for HTTP error responses with type (URI), title (human summary), status (HTTP code), detail (specific explanation), and instance (URI of the occurrence)",
      "Semantic HTTP Status Codes: Using HTTP status codes according to their RFC-defined semantics rather than always returning 200 with an error flag in the body",
      "Machine-Readable Error Codes: Short stable string codes (INVALID_EMAIL_FORMAT, INSUFFICIENT_FUNDS) that clients can switch on for programmatic handling independent of human-readable messages",
      "Validation Error Extensions: Field-level error details (field path, constraint violated, provided value) appended to the Problem Details body as a custom extension",
      "Error Traceability: Including a trace_id or request_id in every error response so that clients can correlate API errors with server-side logs for debugging"
    ],
    "core_concepts_zh": [
      "RFC 7807问题详情：HTTP错误响应的标准化JSON主体，包含type（URI）、title（人类可读摘要）、status（HTTP代码）、detail（具体说明）和instance（发生URI）",
      "语义化HTTP状态码：根据RFC定义的语义使用HTTP状态码，而不是总是返回200并在主体中包含错误标志",
      "机器可读错误代码：客户端可以switch的短稳定字符串代码（INVALID_EMAIL_FORMAT、INSUFFICIENT_FUNDS），用于独立于人类可读消息的程序化处理",
      "验证错误扩展：字段级错误详情（字段路径、违反的约束、提供的值）作为自定义扩展附加到问题详情主体",
      "错误可追溯性：在每个错误响应中包含trace_id或request_id，使客户端能够将API错误与服务器端日志关联以进行调试"
    ],
    "timeline": [
      [
        "2012",
        "Early API error formats are ad-hoc; Google uses {error: {code, message}}, Twitter uses {errors: [{code, message}]}, with no standard"
      ],
      [
        "2016",
        "IETF publishes RFC 7807 'Problem Details for HTTP APIs', providing the first widely-endorsed standard error response format"
      ],
      [
        "2019",
        "Major API frameworks (Spring Boot, ASP.NET Core) add native RFC 7807 support, driving adoption across enterprise APIs"
      ],
      [
        "2023",
        "RFC 9457 obsoletes RFC 7807, clarifying how to report multiple problems and introducing an IANA registry of common problem types"
      ],
      [
        "2024",
        "API governance tools (Spectral, Redocly) include RFC 9457 conformance rules as default linting rules for enterprise API standards"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "早期API错误格式是临时的；Google使用{error: {code, message}}，Twitter使用{errors: [{code, message}]}，没有标准"
      ],
      [
        "2016",
        "IETF发布RFC 7807「HTTP API的问题详情」，提供第一个广泛认可的标准错误响应格式"
      ],
      [
        "2019",
        "主要API框架（Spring Boot、ASP.NET Core）添加原生RFC 7807支持，推动企业API的采用"
      ],
      [
        "2023",
        "RFC 9457废弃并取代RFC 7807，明确了多个问题的报告方式，并引入常见问题类型的IANA注册表"
      ],
      [
        "2024",
        "API治理工具（Spectral、Redocly）将RFC 9457合规规则作为企业API标准的默认代码检查规则"
      ]
    ],
    "dos": [
      "Do always include a machine-readable error code alongside the human-readable detail so that client code can handle errors programmatically without parsing strings",
      "Do include a trace_id in every error response so that on-call engineers can correlate client-reported errors with distributed traces in seconds",
      "Do return field-level validation errors as structured extensions (not a flat string) so that form clients can display errors next to the relevant input",
      "Do document every error response in the OpenAPI spec so that SDK generators produce typed error classes and clients know what to expect"
    ],
    "dos_zh": [
      "始终在人类可读的detail旁边包含机器可读的错误代码，使客户端代码可以以编程方式处理错误而无需解析字符串",
      "在每个错误响应中包含trace_id，使值班工程师能够在几秒内将客户端报告的错误与分布式追踪关联",
      "将字段级验证错误作为结构化扩展（而非平面字符串）返回，使表单客户端能够在相关输入旁边显示错误",
      "在OpenAPI规范中记录每个错误响应，使SDK生成器产生类型化错误类，客户端知道预期的内容"
    ],
    "donts": [
      "Don't return HTTP 200 with an error flag in the body because it breaks HTTP-layer tools (CDNs, proxies, monitoring) that rely on status codes for routing and alerting",
      "Don't expose internal stack traces or database error messages in API error responses because they reveal exploitable implementation details",
      "Don't use the same error type URI for different error conditions because it forces clients to parse the detail string to distinguish errors",
      "Don't omit error documentation from the OpenAPI spec because undocumented errors are invisible to SDK generators and developer portal users"
    ],
    "donts_zh": [
      "不要在主体中返回HTTP 200并带有错误标志，因为这会破坏依赖状态码进行路由和告警的HTTP层工具（CDN、代理、监控）",
      "不要在API错误响应中暴露内部堆栈跟踪或数据库错误消息，因为它们会泄露可被利用的实现细节",
      "不要对不同的错误条件使用相同的错误类型URI，因为这迫使客户端解析detail字符串来区分错误",
      "不要从OpenAPI规范中省略错误文档，因为未记录的错误对SDK生成器和开发者门户用户是不可见的"
    ],
    "case_study_company": "Zalando",
    "case_study": "Zalando, the European e-commerce platform, published its REST API Guidelines (open-sourced on GitHub) which mandated RFC 7807 Problem Details for all internal and external APIs. By standardizing error responses across hundreds of microservices, Zalando enabled a shared error-handling middleware in its API gateway that automatically enriched errors with trace IDs, correlated them with Zipkin traces, and forwarded structured error metrics to its observability platform. Client teams reported a 60% reduction in integration debugging time because errors were self-describing and immediately traceable.",
    "case_study_zh": "欧洲电商平台Zalando发布了其REST API指南（在GitHub上开源），要求所有内外部API使用RFC 7807问题详情。通过在数百个微服务中标准化错误响应，Zalando在其API网关中启用了共享错误处理中间件，自动用追踪ID丰富错误，将其与Zipkin追踪关联，并将结构化错误指标转发到可观测性平台。客户端团队报告集成调试时间减少了60%，因为错误是自描述的且可立即追踪。",
    "when_not_to_use": [
      "Binary protocols (gRPC, Thrift) that have their own error status code mechanisms and don't use HTTP response bodies for errors",
      "Internal service-to-service communication where both sides share the same codebase and typed exceptions are more expressive than JSON error bodies",
      "Streaming APIs where the error format is dictated by the streaming protocol (SSE, WebSocket) rather than HTTP response status",
      "Legacy APIs where clients already depend on a non-standard error format and migration risk outweighs the benefit of standardization"
    ],
    "when_not_to_use_zh": [
      "二进制协议（gRPC、Thrift）有自己的错误状态码机制，不使用HTTP响应主体来传递错误",
      "内部服务间通信，双方共享同一代码库，类型化异常比JSON错误主体更具表达力",
      "流式API，错误格式由流协议（SSE、WebSocket）而非HTTP响应状态决定",
      "遗留API，客户端已经依赖非标准错误格式，迁移风险大于标准化的收益"
    ],
    "adopters": [
      "Zalando",
      "Microsoft Azure",
      "ASP.NET Core",
      "Spring Boot",
      "Adyen"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Nottingham, M. & Wilde, E. (2016). \"RFC 7807: Problem Details for HTTP APIs\". IETF.",
    "secondary_sources": [
      "IETF (2023). \"RFC 9457: Problem Details for HTTP APIs\" (obsoletes RFC 7807). IETF.",
      "Geewax, J.J. (2021). \"API Design Patterns\", Ch. 6. Manning Publications.",
      "Zalando SE (2022). \"Zalando RESTful API and Event Guidelines\". opensource.zalando.com."
    ],
    "typed_relations": [
      {
        "slug": "openapi-specification",
        "type": "complement"
      },
      {
        "slug": "api-first-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 213,
    "name": "API Deprecation Lifecycle",
    "name_zh": "API弃用生命周期",
    "slug": "api-deprecation-lifecycle",
    "category": "api",
    "desc": "Sunset headers, versioned migration paths, and deprecation policies for retiring API versions",
    "desc_zh": "用于停用API版本的日落头、版本化迁移路径和弃用策略",
    "steps": [
      "Announce deprecation with a minimum notice period (typically 6-12 months for public APIs) via developer portal, email, and Deprecation + Sunset HTTP headers on every response",
      "Publish a migration guide that maps every deprecated endpoint, field, and behavior to its replacement in the new version, with code examples in all supported languages",
      "Add the Sunset header (IETF RFC 8594) and the Deprecation header to all deprecated endpoints so that monitoring tools, API gateways, and SDK clients can detect and alert on deprecated usage",
      "Track consumer adoption of the new version using API analytics; proactively reach out to consumers still on the deprecated version as the sunset date approaches",
      "On the sunset date, return HTTP 410 Gone for deprecated endpoints with a Problem Details body linking to the migration guide, and remove the deprecated code after a final grace period"
    ],
    "steps_zh": [
      "通过开发者门户、邮件以及每个响应上的Deprecation + Sunset HTTP头，以最短通知期（公共API通常为6-12个月）宣布弃用",
      "发布迁移指南，将每个已弃用的端点、字段和行为映射到新版本中的替代品，并提供所有支持语言的代码示例",
      "向所有已弃用端点添加Sunset头（IETF RFC 8594）和Deprecation头，使监控工具、API网关和SDK客户端能够检测并告警已弃用的使用",
      "使用API分析跟踪消费者对新版本的采用；在日落日期临近时，主动联系仍在使用已弃用版本的消费者",
      "在日落日期，对已弃用端点返回HTTP 410 Gone，并在问题详情主体中链接到迁移指南，在最终宽限期后删除已弃用代码"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Deprecation Notice",
      "Migration Guide",
      "Sunset Headers",
      "Traffic Monitor",
      "HTTP 410"
    ],
    "viz_labels_zh": [
      "弃用通知",
      "迁移指南",
      "日落响应头",
      "流量监控",
      "HTTP 410"
    ],
    "related": [
      "openapi-specification",
      "api-first-design",
      "api-gateway-pattern"
    ],
    "tags": [
      "deprecation",
      "sunset",
      "versioning",
      "lifecycle",
      "migration"
    ],
    "origin_author": "IETF RFC 8594 (Wilde, 2019); deprecation practices pioneered by Stripe, Twilio, and Salesforce",
    "origin_source": "IETF RFC 8594: The Sunset HTTP Header Field (Wilde, 2019); Deprecation HTTP Header Field (Dalal & Wilde, draft-ietf-httpapi-deprecation-header)",
    "origin_source_zh": "IETF RFC 8594：Sunset HTTP头字段（Wilde，2019）；弃用HTTP头字段（Dalal & Wilde，draft-ietf-httpapi-deprecation-header）",
    "complexity": "intermediate",
    "when_to_use": [
      "When introducing a new API version that breaks backward compatibility and existing consumers need time to migrate",
      "When removing a field, endpoint, or behavior that has active consumers and cannot be removed without a coordinated migration",
      "When the organization has a published API versioning policy that commits to minimum deprecation notice periods for external consumers",
      "When building an API platform where consumers include third-party developers who cannot be force-upgraded on the provider's schedule"
    ],
    "when_to_use_zh": [
      "当引入破坏向后兼容性的新API版本，现有消费者需要时间迁移时",
      "当删除有活跃消费者的字段、端点或行为，无法在没有协调迁移的情况下删除时",
      "当组织有已发布的API版本策略，承诺对外部消费者提供最短弃用通知期时",
      "当构建API平台，消费者包括无法按提供商计划强制升级的第三方开发者时"
    ],
    "core_concepts": [
      "Deprecation Header: HTTP response header (Deprecation: <timestamp>) indicating the date a resource was deprecated; clients and monitoring tools can detect it automatically",
      "Sunset Header: RFC 8594 HTTP response header (Sunset: <HTTP-date>) announcing the date after which the resource will be unavailable, giving consumers a concrete deadline",
      "Versioned Migration Paths: Explicit documentation mapping each deprecated element to its replacement across version boundaries, with migration scripts where applicable",
      "Deprecation Notice Period: The minimum time between announcing deprecation and enforcing the sunset — typically 6 months for external APIs, 3 months for internal, and 2 years for major platforms",
      "Consumer Tracking: Using API analytics to identify which consumers are still using deprecated resources so that targeted outreach can drive migration before the sunset date"
    ],
    "core_concepts_zh": [
      "弃用头：HTTP响应头（Deprecation: <timestamp>），指示资源被弃用的日期；客户端和监控工具可以自动检测",
      "日落头：RFC 8594 HTTP响应头（Sunset: <HTTP-date>），宣布资源将不可用的日期，给消费者一个具体的截止日期",
      "版本化迁移路径：将每个已弃用元素映射到跨版本边界的替代品的明确文档，在适用的情况下提供迁移脚本",
      "弃用通知期：宣布弃用和执行日落之间的最短时间——外部API通常为6个月，内部为3个月，主要平台为2年",
      "消费者跟踪：使用API分析来识别哪些消费者仍在使用已弃用资源，以便在日落日期前通过针对性推广来推动迁移"
    ],
    "timeline": [
      [
        "2000",
        "Salesforce introduces API versioning with a multi-year support window, pioneering formal API deprecation for enterprise SaaS"
      ],
      [
        "2014",
        "Stripe publishes its API versioning policy with per-account version pinning, allowing controlled migration without forced upgrades"
      ],
      [
        "2019",
        "IETF publishes RFC 8594 defining the Sunset HTTP header, providing a machine-readable deprecation signal"
      ],
      [
        "2021",
        "IETF draft for Deprecation HTTP header field (draft-ietf-httpapi-deprecation-header) advances, pairing with Sunset for complete lifecycle signaling"
      ],
      [
        "2023",
        "API governance platforms (Stoplight, Postman) integrate deprecation lifecycle management with automated consumer impact analysis"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "Salesforce引入具有多年支持窗口的API版本控制，为企业SaaS开创了正式的API弃用"
      ],
      [
        "2014",
        "Stripe发布其API版本策略，采用按账户版本固定，允许受控迁移而无需强制升级"
      ],
      [
        "2019",
        "IETF发布RFC 8594定义Sunset HTTP头，提供机器可读的弃用信号"
      ],
      [
        "2021",
        "弃用HTTP头字段的IETF草案（draft-ietf-httpapi-deprecation-header）推进，与Sunset配合提供完整的生命周期信号"
      ],
      [
        "2023",
        "API治理平台（Stoplight、Postman）集成弃用生命周期管理和自动化消费者影响分析"
      ]
    ],
    "dos": [
      "Do add Deprecation and Sunset headers to every deprecated endpoint response so that automated SDK clients can log warnings without any manual documentation lookup",
      "Do give at least 6 months notice for external API deprecations because third-party developers need time to prioritize and plan migration work",
      "Do publish a specific sunset date rather than 'eventually' because indefinite timelines are ignored and only hard deadlines motivate migration",
      "Do track active consumers of deprecated endpoints using API analytics and send personal outreach to teams still using them 60 days before sunset"
    ],
    "dos_zh": [
      "向每个已弃用端点响应添加Deprecation和Sunset头，使自动化SDK客户端无需任何手动文档查找即可记录警告",
      "为外部API弃用提供至少6个月的通知，因为第三方开发者需要时间来优先考虑和规划迁移工作",
      "发布具体的日落日期而不是「最终」，因为无限期的时间表会被忽视，只有硬截止日期才能推动迁移",
      "使用API分析跟踪已弃用端点的活跃消费者，并在日落前60天向仍在使用它们的团队发送个人联系"
    ],
    "donts": [
      "Don't remove a deprecated endpoint without first verifying through analytics that all consumers have migrated because silent removal causes production outages",
      "Don't use vague deprecation notices ('will be removed in a future version') because they create no urgency and are ignored until production breaks",
      "Don't maintain deprecated endpoints indefinitely without a sunset date because the maintenance burden accumulates and security vulnerabilities are not patched",
      "Don't skip backward-compatible migration paths because consumers cannot migrate if the new version requires rewriting significant amounts of integration code"
    ],
    "donts_zh": [
      "不要在未通过分析验证所有消费者已迁移的情况下删除已弃用端点，因为无声删除会导致生产中断",
      "不要使用模糊的弃用通知（「将在未来版本中删除」），因为它们不产生紧迫感，在生产崩溃之前会被忽视",
      "不要在没有日落日期的情况下无限期维护已弃用端点，因为维护负担会累积，安全漏洞无法修补",
      "不要跳过向后兼容的迁移路径，因为如果新版本需要重写大量集成代码，消费者就无法迁移"
    ],
    "case_study_company": "Salesforce",
    "case_study": "Salesforce manages one of the most complex API deprecation lifecycles in the industry, supporting hundreds of API versions simultaneously for its enterprise customers. Salesforce commits to a minimum three-version support window for its platform APIs — when a version is deprecated, it remains available for three subsequent annual releases before being retired. Every Salesforce API response includes a version header, and the developer portal prominently displays the support matrix with exact end-of-life dates. This policy allowed enterprise customers to plan multi-year migration roadmaps without the risk of unexpected API removals, and is credited with driving Salesforce's customer retention in regulated industries.",
    "case_study_zh": "Salesforce管理着业界最复杂的API弃用生命周期之一，同时为其企业客户支持数百个API版本。Salesforce对其平台API承诺最少三个版本的支持窗口——当一个版本被弃用时，它在被停用之前仍可用于三个后续年度版本。每个Salesforce API响应都包含一个版本头，开发者门户突出显示支持矩阵及确切的生命周期结束日期。这一策略允许企业客户规划多年迁移路线图，而不必担心意外的API删除，并被认为推动了Salesforce在受监管行业的客户留存。",
    "when_not_to_use": [
      "Internal APIs consumed by a single team where direct code changes are faster and less risky than a formal deprecation process",
      "Alpha or beta APIs explicitly labeled as unstable where consumers accept that breaking changes can occur without notice",
      "APIs with no active consumers as tracked by analytics, where immediate removal is safe and more responsible than maintaining dead code",
      "Undocumented private endpoints that were never part of the public API contract and were not intended for consumer use"
    ],
    "when_not_to_use_zh": [
      "由单个团队使用的内部API，直接代码变更比正式弃用流程更快且风险更低",
      "明确标记为不稳定的Alpha或Beta API，消费者接受可能发生无通知的破坏性变更",
      "分析跟踪到没有活跃消费者的API，立即删除是安全且比维护死代码更负责任的",
      "从未属于公共API契约且非供消费者使用的未记录私有端点"
    ],
    "adopters": [
      "Salesforce",
      "Stripe",
      "Twilio",
      "GitHub",
      "Google Cloud"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "usability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Wilde, E. (2019). \"RFC 8594: The Sunset HTTP Header Field\". IETF.",
    "secondary_sources": [
      "Hardt, D. & Wilde, E. (2023). \"The Deprecation HTTP Header Field\". IETF draft-ietf-httpapi-deprecation-header.",
      "Geewax, J.J. (2021). \"API Design Patterns\", Ch. 15. Manning Publications.",
      "Sturgeon, P. (2022). \"Designing APIs with Swagger and OpenAPI\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "openapi-specification",
        "type": "complement"
      },
      {
        "slug": "api-first-design",
        "type": "complement"
      },
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 214,
    "name": "Edge Computing API Patterns",
    "name_zh": "边缘计算API模式",
    "slug": "edge-computing-api-patterns",
    "category": "api",
    "desc": "CDN-edge function design for low-latency API responses and global distribution",
    "desc_zh": "用于低延迟API响应和全球分发的CDN边缘函数设计",
    "steps": [
      "Identify which API handlers are stateless, read-heavy, or geographically latency-sensitive and are candidates for edge deployment (auth token validation, personalization headers, A/B routing, geolocation-based redirects)",
      "Deploy edge functions on a CDN edge platform (Cloudflare Workers, AWS Lambda@Edge, Fastly Compute) that executes JavaScript or Wasm close to the user",
      "Use edge-cacheable response patterns: set appropriate Cache-Control headers, use stale-while-revalidate for non-critical freshness, and vary the cache key on headers like Accept-Language or CF-IPCountry",
      "For dynamic data at the edge, use edge KV stores (Cloudflare KV, Durable Objects) for low-latency reads with eventual consistency, reserving strong-consistency writes for origin servers",
      "Implement edge observability: collect structured logs at the edge, forward traces with injected trace IDs to the origin, and monitor edge cache hit rates and p99 latencies per edge PoP"
    ],
    "steps_zh": [
      "识别哪些API处理程序是无状态的、读密集型的或地理位置上对延迟敏感的，是边缘部署的候选（认证令牌验证、个性化头、A/B路由、基于地理位置的重定向）",
      "在CDN边缘平台（Cloudflare Workers、AWS Lambda@Edge、Fastly Compute）上部署边缘函数，在靠近用户的地方执行JavaScript或Wasm",
      "使用边缘可缓存响应模式：设置适当的Cache-Control头，为非关键新鲜度使用stale-while-revalidate，并在Accept-Language或CF-IPCountry等头上变化缓存键",
      "对于边缘的动态数据，使用边缘KV存储（Cloudflare KV、Durable Objects）进行低延迟读取和最终一致性，将强一致性写操作保留给源服务器",
      "实现边缘可观测性：在边缘收集结构化日志，将带有注入追踪ID的追踪转发到源，并监控每个边缘PoP的边缘缓存命中率和p99延迟"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Edge Candidates",
      "Edge Functions",
      "Cache Patterns",
      "Edge KV Store",
      "Edge Observ"
    ],
    "viz_labels_zh": [
      "边缘候选",
      "边缘函数",
      "缓存策略",
      "边缘KV存储",
      "边缘可观测"
    ],
    "related": [
      "api-gateway-pattern",
      "api-first-design"
    ],
    "tags": [
      "edge-computing",
      "cdn",
      "cloudflare-workers",
      "low-latency",
      "serverless"
    ],
    "origin_author": "Cloudflare Workers (2017), AWS Lambda@Edge (2017), Fastly Compute@Edge (2019)",
    "origin_source": "Cloudflare Workers documentation (developers.cloudflare.com); AWS Lambda@Edge Developer Guide; The Cloud-Native Edge (InfoQ, 2022)",
    "origin_source_zh": "Cloudflare Workers文档（developers.cloudflare.com）；AWS Lambda@Edge开发指南；「云原生边缘」（InfoQ，2022）",
    "complexity": "advanced",
    "when_to_use": [
      "When global users experience high latency because all API traffic routes to a single geographic origin region",
      "When authentication, authorization, or request transformation logic can be executed at the CDN edge to reduce load and latency on origin servers",
      "When building personalization, geolocation routing, or A/B testing that must happen before the response is served without adding a round-trip to the origin",
      "When AI inference at the edge (small models, embeddings) can answer requests without incurring cross-continent latency to a centralized GPU cluster"
    ],
    "when_to_use_zh": [
      "当全球用户因所有API流量路由到单一地理源区域而遇到高延迟时",
      "当认证、授权或请求转换逻辑可以在CDN边缘执行，以减少源服务器的负载和延迟时",
      "当构建必须在响应被提供之前发生的个性化、地理位置路由或A/B测试，而不需要额外的源往返时",
      "当边缘AI推断（小型模型、嵌入）可以在不产生到集中式GPU集群的跨大陆延迟的情况下响应请求时"
    ],
    "core_concepts": [
      "Edge Functions: Lightweight serverless functions deployed to CDN Points of Presence (PoPs) that execute within single-digit milliseconds of the user, before traffic reaches the origin",
      "Edge Caching: Storing API responses at CDN edge nodes with Cache-Control and Surrogate-Control headers to serve repeat requests from cache rather than forwarding to the origin",
      "Edge KV Stores: Eventually-consistent key-value stores co-located with edge functions (Cloudflare KV, AWS DynamoDB Global Tables) for low-latency reads of configuration, feature flags, and session data",
      "Request Routing at the Edge: Using edge functions to implement URL rewrites, A/B testing, geographic routing, and canary deployments without deploying changes to the origin",
      "Cold Start Elimination: CDN edge runtimes (Cloudflare Workers, Fastly Compute) use V8 isolates or Wasm rather than containers, eliminating cold start latency that characterizes traditional serverless functions"
    ],
    "core_concepts_zh": [
      "边缘函数：部署在CDN接入点（PoP）的轻量级无服务器函数，在流量到达源之前在距用户个位数毫秒内执行",
      "边缘缓存：使用Cache-Control和Surrogate-Control头将API响应存储在CDN边缘节点，从缓存提供重复请求而不转发到源",
      "边缘KV存储：与边缘函数共置的最终一致性键值存储（Cloudflare KV、AWS DynamoDB全局表），用于低延迟读取配置、特性标志和会话数据",
      "边缘请求路由：使用边缘函数实现URL重写、A/B测试、地理路由和金丝雀部署，而无需将变更部署到源",
      "消除冷启动：CDN边缘运行时（Cloudflare Workers、Fastly Compute）使用V8隔离器或Wasm而非容器，消除了传统无服务器函数特有的冷启动延迟"
    ],
    "timeline": [
      [
        "2009",
        "Akamai EdgeSuite introduces edge-side includes and logic, pioneering programmable CDN edge for dynamic content"
      ],
      [
        "2017",
        "AWS Lambda@Edge and Cloudflare Workers launched, enabling full JavaScript execution at CDN edge nodes globally"
      ],
      [
        "2019",
        "Fastly Compute@Edge introduces WebAssembly at the edge, enabling polyglot edge functions with near-zero cold starts"
      ],
      [
        "2021",
        "Edge-native frameworks (Next.js Edge Runtime, Remix, SvelteKit) adopt edge functions as first-class deployment targets"
      ],
      [
        "2024",
        "AI inference at the edge (Cloudflare AI, AWS Bedrock at edge) enables sub-100ms LLM completions for simple queries globally"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "Akamai EdgeSuite引入边缘侧包含和逻辑，为动态内容开创了可编程CDN边缘"
      ],
      [
        "2017",
        "AWS Lambda@Edge和Cloudflare Workers发布，支持在全球CDN边缘节点执行完整的JavaScript"
      ],
      [
        "2019",
        "Fastly Compute@Edge在边缘引入WebAssembly，支持近零冷启动的多语言边缘函数"
      ],
      [
        "2021",
        "边缘原生框架（Next.js Edge Runtime、Remix、SvelteKit）将边缘函数作为一级部署目标"
      ],
      [
        "2024",
        "边缘AI推断（Cloudflare AI、AWS Bedrock at edge）为全球简单查询实现低于100ms的LLM响应"
      ]
    ],
    "dos": [
      "Do keep edge functions small and stateless because edge runtimes have strict CPU time, memory, and bundle size limits that don't apply to origin servers",
      "Do use edge KV stores for read-heavy configuration data (feature flags, rate limit rules, geo-routing tables) because they provide sub-millisecond reads without origin round-trips",
      "Do propagate trace IDs from edge to origin so that full request traces span both edge and origin processing in your distributed tracing system",
      "Do monitor cache hit rates per edge PoP because a low cache hit rate at the edge means most traffic still reaches the origin, negating the latency benefit"
    ],
    "dos_zh": [
      "保持边缘函数小型且无状态，因为边缘运行时有严格的CPU时间、内存和包大小限制，这些限制不适用于源服务器",
      "对读密集型配置数据（特性标志、速率限制规则、地理路由表）使用边缘KV存储，因为它们提供亚毫秒级读取而无需源往返",
      "将追踪ID从边缘传播到源，使完整的请求追踪在分布式追踪系统中跨越边缘和源处理",
      "监控每个边缘PoP的缓存命中率，因为边缘的低缓存命中率意味着大多数流量仍然到达源，否定了延迟优势"
    ],
    "donts": [
      "Don't put stateful or strongly-consistent operations (database writes, payment processing) in edge functions because edge KV stores are eventually consistent and edge-to-DB latency can exceed origin latency",
      "Don't deploy untested logic to edge functions in production without a gradual rollout because a faulty edge function breaks every user globally, not just in one region",
      "Don't ignore bundle size limits for edge functions because large dependency trees will exceed the edge runtime limits and fail silently at deploy time",
      "Don't use edge functions as a shortcut to avoid fixing origin latency because edge caching only helps for cacheable responses — non-cacheable dynamic APIs still hit the origin"
    ],
    "donts_zh": [
      "不要在边缘函数中放置有状态或强一致性操作（数据库写入、支付处理），因为边缘KV存储是最终一致的，边缘到数据库的延迟可能超过源延迟",
      "不要在没有渐进式发布的情况下将未经测试的逻辑部署到生产边缘函数，因为错误的边缘函数会在全球范围内影响每个用户，而不仅仅是一个区域",
      "不要忽视边缘函数的包大小限制，因为大型依赖树会超过边缘运行时限制，并在部署时静默失败",
      "不要使用边缘函数作为避免修复源延迟的捷径，因为边缘缓存只对可缓存响应有帮助——不可缓存的动态API仍然访问源"
    ],
    "case_study_company": "Cloudflare",
    "case_study": "Cloudflare built its own API gateway and developer platform (Cloudflare Workers, Pages, D1, KV) entirely on its edge network. Cloudflare's API itself uses edge functions for authentication, rate limiting, and routing — all executed within milliseconds of the user across 300+ PoPs worldwide. A benchmark published in 2023 showed that Cloudflare Workers API responses averaged 8ms globally vs. 120ms for equivalent AWS Lambda (us-east-1) responses for users in Asia-Pacific, demonstrating the latency advantage of edge-first API design. The platform serves over 50 million requests per second from the edge with no origin involved.",
    "case_study_zh": "Cloudflare完全在其边缘网络上构建了自己的API网关和开发者平台（Cloudflare Workers、Pages、D1、KV）。Cloudflare自身的API使用边缘函数进行认证、速率限制和路由——所有这些都在全球300多个PoP中在用户毫秒之内执行。2023年发布的一项基准测试显示，对于亚太地区的用户，Cloudflare Workers API响应全球平均为8ms，而等效的AWS Lambda（us-east-1）响应为120ms，展示了边缘优先API设计的延迟优势。该平台每秒处理超过5000万个请求，无需访问源。",
    "when_not_to_use": [
      "APIs that perform complex database transactions or strong-consistency operations that cannot tolerate the eventual-consistency model of edge KV stores",
      "Compute-intensive workloads (video transcoding, ML model training) that require more CPU and memory than edge runtime limits allow",
      "APIs that must comply with data residency regulations requiring all data processing to occur in specific geographic regions",
      "Internal microservice-to-microservice APIs that run within a single data center where network latency is already sub-millisecond"
    ],
    "when_not_to_use_zh": [
      "执行无法容忍边缘KV存储最终一致性模型的复杂数据库事务或强一致性操作的API",
      "需要比边缘运行时限制更多CPU和内存的计算密集型工作负载（视频转码、ML模型训练）",
      "必须遵守数据驻留法规的API，要求所有数据处理发生在特定地理区域",
      "在单个数据中心内运行的内部微服务间API，网络延迟已经是亚毫秒级"
    ],
    "adopters": [
      "Cloudflare",
      "Vercel",
      "Netlify",
      "Shopify",
      "Discord"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "performance",
      "scalability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Cloudflare (2017). \"Cloudflare Workers: Serverless at the Edge\". developers.cloudflare.com.",
    "secondary_sources": [
      "AWS (2017). \"Lambda@Edge Developer Guide\". docs.aws.amazon.com.",
      "Fastly (2019). \"Compute@Edge: WebAssembly at the Edge\". developer.fastly.com.",
      "Grigorik, I. (2023). \"Edge Computing: The Next Frontier of API Design\". InfoQ."
    ],
    "typed_relations": [
      {
        "slug": "api-gateway-pattern",
        "type": "extends"
      },
      {
        "slug": "api-first-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 240,
    "name": "API Versioning Strategies",
    "name_zh": "API 版本管理策略",
    "slug": "api-versioning-strategies",
    "category": "api",
    "desc": "URL path, header, and query parameter techniques for evolving APIs without breaking clients",
    "desc_zh": "通过 URL 路径、请求头和查询参数等技术演进 API，同时避免破坏已有客户端",
    "steps": [
      "Choose a versioning scheme based on client type and change frequency: URL path (/v1/resources) for public APIs with broad client diversity, custom request headers (API-Version: 2024-01) for header-aware clients, or query parameters (?version=1) for simple integrations",
      "Establish a version lifecycle policy specifying how long each version will be supported (e.g., two concurrent versions, 12-month deprecation notice) and communicate it clearly in API documentation",
      "Implement version routing at the API gateway or controller layer, mapping version identifiers to the correct handler implementations, keeping business logic version-agnostic where possible",
      "Use semantic versioning semantics to distinguish breaking changes (major version bump) from additive changes (backward-compatible additions do not require a new version) and document the change log per version",
      "Sunset old versions using HTTP 410 Gone responses and Deprecation/Sunset headers, monitor traffic to deprecated endpoints, and notify clients in advance before retiring a version"
    ],
    "steps_zh": [
      "根据客户端类型和变更频率选择版本方案：URL 路径（/v1/resources）适合客户端多样的公共 API；自定义请求头（API-Version: 2024-01）适合支持头部的客户端；查询参数（?version=1）适合简单集成场景",
      "制定版本生命周期策略，明确每个版本的支持时长（如同时维护两个版本、12 个月废弃通知期），并在 API 文档中清晰传达",
      "在 API 网关或控制器层实现版本路由，将版本标识符映射到对应处理实现，尽量保持业务逻辑与版本无关",
      "用语义版本语义区分破坏性变更（主版本递增）和向后兼容的新增内容（无需新版本），并按版本维护变更日志",
      "通过 HTTP 410 Gone 响应和 Deprecation/Sunset 头部废弃旧版本，监控废弃端点的流量，并在下线前提前通知客户端"
    ],
    "ai_relevant": false,
    "viz_type": "timeline",
    "viz_labels": [
      "Version Scheme",
      "Lifecycle Policy",
      "Version Routing",
      "Semantic Versioning",
      "Sunset"
    ],
    "viz_labels_zh": [
      "版本方案",
      "生命周期",
      "版本路由",
      "语义版本",
      "下线策略"
    ],
    "related": [
      "openapi-specification",
      "api-gateway-pattern",
      "expansion-contraction-pattern"
    ],
    "tags": [
      "versioning",
      "backward-compatibility",
      "api-lifecycle",
      "deprecation"
    ],
    "origin_author": "Troy Hunt",
    "origin_source": "Troy Hunt (2014). \"Your API versioning is wrong\". troyhunt.com; REST API Design Rulebook (Mark Masse, O'Reilly 2011)",
    "origin_source_zh": "Troy Hunt（2014）。「你的 API 版本管理是错误的」，troyhunt.com；《REST API 设计规范》（Mark Masse，O'Reilly 2011）",
    "complexity": "intermediate",
    "when_to_use": [
      "When a public API serves third-party developers or partners who cannot be forced to upgrade in lockstep with server releases",
      "When introducing breaking changes (renaming fields, changing response structures, removing endpoints) that would silently corrupt existing client integrations",
      "When different client segments (mobile apps, legacy enterprise systems, new web frontends) need to migrate at different speeds",
      "When regulatory or contractual obligations require a stable, versioned API surface for a defined period"
    ],
    "when_to_use_zh": [
      "当公共 API 服务于第三方开发者或合作伙伴，无法强制其与服务端同步升级时",
      "当引入破坏性变更（重命名字段、改变响应结构、删除端点）可能悄然破坏现有客户端集成时",
      "当不同客户端群体（移动应用、遗留企业系统、新 Web 前端）需要以不同速度迁移时",
      "当法规或合同义务要求在特定期限内提供稳定的、带版本号的 API 接口时"
    ],
    "core_concepts": [
      "URL Path Versioning: Embedding the version in the resource path (/v1/users) makes the version explicit, cache-friendly, and easy to route at the gateway, but couples the version to the URL structure",
      "Header Versioning: Using a custom request header (API-Version: 2024-01-15) or the Accept header (application/vnd.example.v2+json) keeps URLs clean but requires header-aware clients and CDN configurations",
      "Query Parameter Versioning: Adding a version query parameter (?api-version=2) is simple to implement but can pollute cache keys and is less semantically clean than path versioning",
      "Semantic Versioning Contract: Distinguishing breaking changes (require a new version) from additive/backward-compatible changes (can be added to the current version) minimizes unnecessary version proliferation",
      "Sunset Policy: A formal lifecycle policy defining support windows, deprecation notices, and sunset dates ensures clients have predictable migration timelines and avoids surprise API removals"
    ],
    "core_concepts_zh": [
      "URL 路径版本：将版本嵌入资源路径（/v1/users），版本显式可见、便于缓存和网关路由，但与 URL 结构耦合",
      "请求头版本：使用自定义请求头（API-Version: 2024-01-15）或 Accept 头（application/vnd.example.v2+json），保持 URL 整洁，但要求客户端和 CDN 支持头部处理",
      "查询参数版本：通过版本查询参数（?api-version=2）实现，简单直接，但可能污染缓存键，语义上不如路径版本清晰",
      "语义版本契约：区分破坏性变更（需要新版本）与向后兼容的新增内容（可加入当前版本），最大限度减少不必要的版本扩散",
      "废弃策略：正式的生命周期策略定义支持窗口、废弃通知和下线日期，确保客户端有可预期的迁移时间线，避免突然删除 API"
    ],
    "timeline": [
      [
        "2000",
        "Early REST APIs use ad-hoc versioning; Roy Fielding's REST dissertation establishes the foundation for API design"
      ],
      [
        "2011",
        "Stripe launches its date-based API versioning (YYYY-MM-DD) which becomes an industry reference model for SaaS APIs"
      ],
      [
        "2014",
        "Troy Hunt's blog post 'Your API versioning is wrong' sparks widespread debate on URL path vs header versioning"
      ],
      [
        "2021",
        "IETF publishes RFC 8594 (Sunset Header) standardizing HTTP header for announcing API deprecation and retirement dates"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "早期 REST API 使用临时版本管理；Roy Fielding 的 REST 论文奠定了 API 设计基础"
      ],
      [
        "2011",
        "Stripe 推出基于日期的 API 版本控制（YYYY-MM-DD），成为 SaaS API 的行业参考模型"
      ],
      [
        "2014",
        "Troy Hunt 的博文「你的 API 版本管理是错误的」引发了 URL 路径与请求头版本方式的广泛讨论"
      ],
      [
        "2021",
        "IETF 发布 RFC 8594（Sunset 头部），标准化了用于宣布 API 废弃和下线日期的 HTTP 头部"
      ]
    ],
    "dos": [
      "Do publish and commit to a versioning lifecycle policy before you launch the first public version, because retrofitting deprecation timelines is much harder than establishing them upfront",
      "Do use the HTTP Deprecation and Sunset response headers on all deprecated endpoints so client SDKs and monitoring tools can detect and surface upcoming retirements automatically",
      "Do version the API at the coarsest granularity that avoids breaking changes — prefer additive evolution within a version over creating a new version for every non-breaking change",
      "Do maintain a versioned changelog and migration guide so clients can understand exactly what changed and how to upgrade"
    ],
    "dos_zh": [
      "在发布第一个公开版本之前制定并承诺版本生命周期策略，因为事后追加废弃时间线远比提前确立更难",
      "在所有已废弃端点上使用 HTTP Deprecation 和 Sunset 响应头，以便客户端 SDK 和监控工具能自动检测并提醒即将到来的下线",
      "以最粗粒度对 API 进行版本化以避免破坏性变更——在版本内优先采用向后兼容的新增演进，而非每次非破坏性变更都创建新版本",
      "维护带版本的变更日志和迁移指南，让客户端清楚了解发生了哪些变化以及如何升级"
    ],
    "donts": [
      "Don't version every minor change — additive, backward-compatible changes (new optional fields, new endpoints) should be added to the current version without bumping the version number",
      "Don't maintain more than two or three simultaneous major versions in production because each version multiplies testing, documentation, and operational burden indefinitely",
      "Don't silently deprecate an API version by simply stopping to maintain it — always communicate deprecation timelines and respond to requests with proper Deprecation headers",
      "Don't mix versioning strategies (URL path in some endpoints, headers in others) within the same API surface — pick one and apply it consistently"
    ],
    "donts_zh": [
      "不要对每个小变更都升版本——向后兼容的新增内容（新的可选字段、新端点）应直接加入当前版本，无需递增版本号",
      "不要同时在生产环境维护超过两三个主版本，因为每个版本都会无限叠加测试、文档和运维负担",
      "不要通过停止维护来悄然废弃某个 API 版本——始终传达废弃时间线，并在响应中添加 Deprecation 头部",
      "不要在同一 API 表面混用多种版本策略（某些端点用 URL 路径，其他用请求头）——选定一种后一致应用"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe's API versioning strategy is widely cited as a gold standard. Stripe uses date-based version identifiers (e.g., 2023-10-16) and permanently pins each API key to the version current at the time of the key's creation. This means a developer's integration never breaks due to a Stripe API update — Stripe runs all versions simultaneously and uses automated changelog generation to document every change. As of 2024, Stripe maintains over 10 years of backward-compatible API versions, enabling customers with decade-old integrations to continue operating without modification.",
    "case_study_zh": "Stripe 的 API 版本策略被广泛视为黄金标准。Stripe 采用基于日期的版本标识符（如 2023-10-16），并将每个 API 密钥永久固定到创建时的当前版本。这意味着开发者的集成永远不会因 Stripe 的 API 更新而中断——Stripe 同时运行所有版本，并用自动化变更日志生成来记录每一项变更。截至 2024 年，Stripe 维护着超过 10 年的向后兼容 API 版本，让使用十年前集成的客户无需任何修改即可继续运行。",
    "when_not_to_use": [
      "Internal APIs consumed only by services you own and deploy together — use consumer-driven contract tests instead of versioning to coordinate changes",
      "APIs under active development before any external consumers exist — stabilize the design first, then introduce versioning when you publish",
      "APIs where all consumers can be updated atomically (e.g., a single-page app and its BFF) — prefer feature flags and backward-compatible evolution",
      "GraphQL APIs where field-level deprecation and query flexibility already provide a finer-grained compatibility mechanism than endpoint versioning"
    ],
    "when_not_to_use_zh": [
      "仅由你自己拥有并一同部署的内部 API——使用消费者驱动的契约测试，而非版本管理来协调变更",
      "在任何外部消费者存在之前仍在积极开发中的 API——先稳定设计，再在发布时引入版本管理",
      "所有消费者可以原子性更新的 API（如单页应用及其对应的 BFF）——优先使用功能开关和向后兼容演进",
      "GraphQL API，其字段级废弃和查询灵活性已提供比端点版本管理更细粒度的兼容性机制"
    ],
    "adopters": [
      "Stripe",
      "Twilio",
      "GitHub",
      "Salesforce",
      "Microsoft Azure"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "reliability",
      "usability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Troy Hunt (2014). \"Your API versioning is wrong, which is why I decided to do it 3 different wrong ways\". troyhunt.com.",
    "secondary_sources": [
      "IETF (2021). \"RFC 8594: The Sunset HTTP Header Field\". tools.ietf.org.",
      "Klarna (2022). \"Klarna API Versioning Strategy\". engineering.klarna.com.",
      "Masse, M. (2011). \"REST API Design Rulebook\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "openapi-specification",
        "type": "complement"
      },
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      },
      {
        "slug": "expansion-contraction-pattern",
        "type": "related"
      }
    ]
  },
  {
    "id": 241,
    "name": "REST Maturity Model",
    "name_zh": "REST 成熟度模型",
    "slug": "rest-maturity-model",
    "category": "api",
    "desc": "Leonard Richardson's four-level model measuring REST compliance from plain HTTP to hypermedia-driven APIs",
    "desc_zh": "Leonard Richardson 的四级模型，衡量 API 从基础 HTTP 到超媒体驱动的 REST 合规程度",
    "steps": [
      "Audit your existing API against the four maturity levels: Level 0 (single URI, all verbs, RPC over HTTP), Level 1 (resource-based URIs), Level 2 (HTTP verbs and status codes), Level 3 (hypermedia/HATEOAS controls)",
      "Advance to Level 1 by decomposing the API into resource-based URIs that reflect domain entities (/orders, /customers/{id}) rather than operation names (/getOrder, /createCustomer)",
      "Advance to Level 2 by using HTTP verbs semantically (GET for reads, POST for create, PUT/PATCH for update, DELETE for removal) and returning appropriate HTTP status codes (201 Created, 404 Not Found, 409 Conflict)",
      "Evaluate whether Level 3 (HATEOAS) is appropriate: add hypermedia links to responses so clients can discover available actions dynamically, reducing client-server coupling and enabling API evolution",
      "Document the target maturity level as an API contract and establish linting rules or contract tests to prevent regression to lower levels during iterative development"
    ],
    "steps_zh": [
      "对照四个成熟度级别审计现有 API：第 0 级（单一 URI、所有动词、HTTP 上的 RPC）、第 1 级（基于资源的 URI）、第 2 级（HTTP 动词和状态码）、第 3 级（超媒体/HATEOAS 控制）",
      "通过将 API 分解为基于资源的 URI（如 /orders、/customers/{id}）而非操作名称（如 /getOrder、/createCustomer），升级到第 1 级",
      "通过语义化使用 HTTP 动词（GET 用于读取、POST 用于创建、PUT/PATCH 用于更新、DELETE 用于删除）并返回适当的 HTTP 状态码（201 Created、404 Not Found、409 Conflict），升级到第 2 级",
      "评估第 3 级（HATEOAS）是否合适：在响应中添加超媒体链接，让客户端动态发现可用操作，降低客户端-服务端耦合并支持 API 演进",
      "将目标成熟度级别记录为 API 契约，并建立代码检查规则或契约测试，防止迭代开发中退回到更低级别"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Level 0 RPC",
      "Level 1 Resources",
      "Level 2 Verbs",
      "Level 3 HATEOAS",
      "Target Level"
    ],
    "viz_labels_zh": [
      "L0 RPC",
      "L1 资源",
      "L2 动词",
      "L3 超媒体",
      "目标级别"
    ],
    "related": [
      "openapi-specification",
      "api-versioning-strategies",
      "api-first-design"
    ],
    "tags": [
      "rest",
      "hateoas",
      "http",
      "api-design",
      "maturity-model"
    ],
    "origin_author": "Leonard Richardson",
    "origin_source": "Leonard Richardson (2008). \"Justice Will Take Us Millions Of Intricate Moves\" (QCon presentation); Martin Fowler (2010). \"Richardson Maturity Model\". martinfowler.com",
    "origin_source_zh": "Leonard Richardson（2008）。「正义将带我们走过数百万复杂步骤」（QCon 演讲）；Martin Fowler（2010）。《Richardson 成熟度模型》，martinfowler.com",
    "complexity": "beginner",
    "when_to_use": [
      "When onboarding a team to REST principles and you need a structured progression framework rather than an all-or-nothing standard",
      "When auditing an existing API to identify specific gaps and prioritize incremental improvements",
      "When negotiating API design standards across multiple teams or squads and you need a shared vocabulary for quality levels",
      "When evaluating third-party APIs for integration suitability based on their REST compliance and discoverability"
    ],
    "when_to_use_zh": [
      "当将团队引入 REST 原则时，需要一个结构化的渐进框架，而非全有或全无的标准",
      "当审计现有 API 以识别具体差距并确定增量改进优先级时",
      "当在多个团队或小组之间协商 API 设计标准，需要共享质量级别词汇时",
      "当基于 REST 合规性和可发现性评估第三方 API 的集成适用性时"
    ],
    "core_concepts": [
      "Level 0 — The Swamp of POX: A single URI used for all operations, typically with XML or JSON payloads describing the operation, equivalent to SOAP or XML-RPC over HTTP",
      "Level 1 — Resources: Multiple URIs each representing a distinct resource, but using only POST/GET regardless of the intended operation",
      "Level 2 — HTTP Verbs: Uses the full HTTP verb set (GET, POST, PUT, PATCH, DELETE) semantically aligned to CRUD operations, and returns meaningful HTTP status codes to convey operation outcomes",
      "Level 3 — Hypermedia Controls (HATEOAS): Responses embed links to related actions and resources, enabling clients to navigate the API entirely through hypermedia without out-of-band documentation"
    ],
    "core_concepts_zh": [
      "第 0 级——POX 沼泽：所有操作使用单一 URI，通常用 XML 或 JSON 描述操作，相当于 HTTP 上的 SOAP 或 XML-RPC",
      "第 1 级——资源：多个 URI 各代表一个独立资源，但无论意图如何都只使用 POST/GET",
      "第 2 级——HTTP 动词：语义化使用完整的 HTTP 动词集合（GET、POST、PUT、PATCH、DELETE）对应 CRUD 操作，并返回有意义的 HTTP 状态码传达操作结果",
      "第 3 级——超媒体控制（HATEOAS）：响应中嵌入指向相关操作和资源的链接，让客户端完全通过超媒体导航 API，无需带外文档"
    ],
    "timeline": [
      [
        "2008",
        "Leonard Richardson presents the maturity model at QCon San Francisco, originally titled 'Justice Will Take Us Millions Of Intricate Moves'"
      ],
      [
        "2010",
        "Martin Fowler writes the definitive explanatory article on the Richardson Maturity Model on martinfowler.com, greatly popularizing it"
      ],
      [
        "2014",
        "Roy Fielding clarifies that Level 3 (HATEOAS) is the only true REST — lower levels are 'HTTP APIs', not REST APIs"
      ],
      [
        "2020",
        "Pragmatic industry consensus settles on Level 2 as the practical target; Level 3 adoption remains limited outside hypermedia specialists"
      ]
    ],
    "timeline_zh": [
      [
        "2008",
        "Leonard Richardson 在旧金山 QCon 上发表成熟度模型，原标题为「正义将带我们走过数百万复杂步骤」"
      ],
      [
        "2010",
        "Martin Fowler 在 martinfowler.com 上撰写关于 Richardson 成熟度模型的权威解释文章，使其广为人知"
      ],
      [
        "2014",
        "Roy Fielding 澄清第 3 级（HATEOAS）才是真正的 REST——更低级别是「HTTP API」，而非 REST API"
      ],
      [
        "2020",
        "行业务实共识将第 2 级定为实际目标；第 3 级的采用在超媒体专家之外仍然有限"
      ]
    ],
    "dos": [
      "Do target Level 2 as the baseline for all new APIs — consistent HTTP verbs and status codes provide significant value with modest implementation cost",
      "Do use HTTP status codes precisely: 201 for created resources with a Location header, 204 for successful deletes, 409 for conflicts, 422 for validation errors",
      "Do consider Level 3 (HATEOAS) for APIs used by machine clients that need to discover available state transitions at runtime without hardcoding paths",
      "Do use the model as a diagnostic tool when reviewing legacy APIs to identify which level they operate at and what improvements would advance them"
    ],
    "dos_zh": [
      "将第 2 级作为所有新 API 的基准——一致的 HTTP 动词和状态码以适度的实现成本提供显著价值",
      "精确使用 HTTP 状态码：201 用于已创建的资源（附带 Location 头部）、204 用于成功删除、409 用于冲突、422 用于验证错误",
      "对于需要在运行时发现可用状态转换而无需硬编码路径的机器客户端 API，考虑第 3 级（HATEOAS）",
      "在审查遗留 API 时将该模型作为诊断工具，识别其当前所处级别以及哪些改进能使其提升"
    ],
    "donts": [
      "Don't treat the model as a mandatory progression — most production APIs correctly stop at Level 2; Level 3 adds complexity that is only justified for specific use cases",
      "Don't conflate 'RESTful' with 'uses JSON over HTTP' — an API that ignores HTTP semantics is at Level 0 or 1 regardless of its payload format",
      "Don't add HATEOAS links for the sake of compliance without understanding how clients will use them — unused hypermedia adds payload weight without benefit",
      "Don't retrofit an existing Level 0 API to Level 2 without a migration plan — changing URIs and verbs is a breaking change that requires versioning"
    ],
    "donts_zh": [
      "不要将该模型视为必须逐级递进的要求——大多数生产 API 停在第 2 级是完全正确的；第 3 级增加的复杂性仅在特定用例中才合理",
      "不要将「RESTful」混同于「通过 HTTP 使用 JSON」——忽略 HTTP 语义的 API 无论载荷格式如何都处于第 0 或第 1 级",
      "不要为了合规而添加 HATEOAS 链接，却不理解客户端如何使用它们——无人使用的超媒体只会增加载荷体积而无任何收益",
      "不要在没有迁移计划的情况下将现有第 0 级 API 改造为第 2 级——更改 URI 和动词是破坏性变更，需要版本管理"
    ],
    "case_study_company": "GitHub",
    "case_study": "GitHub's REST API v3 is a widely studied Level 2 implementation. It uses resource-based URIs (/repos/{owner}/{repo}/pulls), proper HTTP verbs, and precise status codes (201 for created PRs, 204 for deleted resources, 422 for validation errors with structured error bodies). GitHub's API documentation explicitly lists which HTTP methods each endpoint supports and what each status code means. The GitHub REST API powers tens of thousands of developer tools and CI/CD integrations, demonstrating that a well-designed Level 2 API can scale to global adoption without needing HATEOAS.",
    "case_study_zh": "GitHub 的 REST API v3 是被广泛研究的第 2 级实现。它使用基于资源的 URI（如 /repos/{owner}/{repo}/pulls）、正确的 HTTP 动词，以及精确的状态码（201 用于已创建的 PR、204 用于已删除的资源、422 用于带结构化错误体的验证错误）。GitHub 的 API 文档明确列出每个端点支持的 HTTP 方法及每个状态码的含义。GitHub REST API 支撑着数以万计的开发者工具和 CI/CD 集成，证明了设计良好的第 2 级 API 无需 HATEOAS 即可扩展到全球规模。",
    "when_not_to_use": [
      "When building event-driven or streaming APIs where HTTP request-response semantics do not fit — use AsyncAPI and event schemas instead",
      "When the team is already operating at Level 2 consistently — the model has served its purpose and further discussion of levels adds no value",
      "When evaluating GraphQL or gRPC APIs — the model is specific to REST/HTTP APIs and does not apply to other API paradigms",
      "When a simple CRUD API with no consumer-facing discoverability needs is being built — Level 2 is sufficient without evaluating Level 3"
    ],
    "when_not_to_use_zh": [
      "当构建事件驱动或流式 API，HTTP 请求-响应语义不适用时——改用 AsyncAPI 和事件模式",
      "当团队已一致在第 2 级运作时——该模型已完成使命，进一步讨论级别没有任何价值",
      "当评估 GraphQL 或 gRPC API 时——该模型特定于 REST/HTTP API，不适用于其他 API 范式",
      "当构建没有消费者可发现性需求的简单 CRUD API 时——第 2 级已经足够，无需评估第 3 级"
    ],
    "adopters": [
      "GitHub",
      "Twilio",
      "PayPal",
      "Amazon Web Services",
      "Atlassian"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "usability",
      "reliability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Fowler, M. (2010). \"Richardson Maturity Model\". martinfowler.com.",
    "secondary_sources": [
      "Richardson, L. & Ruby, S. (2007). \"RESTful Web Services\". O'Reilly Media.",
      "Fielding, R. (2000). \"Architectural Styles and the Design of Network-based Software Architectures\". UC Irvine dissertation."
    ],
    "typed_relations": [
      {
        "slug": "openapi-specification",
        "type": "complement"
      },
      {
        "slug": "api-versioning-strategies",
        "type": "related"
      },
      {
        "slug": "api-first-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 242,
    "name": "API Security Patterns",
    "name_zh": "API 安全模式",
    "slug": "api-security-patterns",
    "category": "api",
    "desc": "OAuth2 scopes, API keys, JWT validation, and CORS hardening to protect API surfaces from unauthorized access",
    "desc_zh": "通过 OAuth2 范围、API 密钥、JWT 验证和 CORS 强化保护 API 表面免受未授权访问",
    "steps": [
      "Classify all API endpoints by sensitivity and select the appropriate authentication mechanism: API keys for server-to-server integrations, OAuth2 client credentials for machine-to-machine flows, OAuth2 authorization code + PKCE for user-delegated access, and mTLS for high-security service mesh communication",
      "Implement fine-grained OAuth2 scopes that follow the principle of least privilege: define scopes at the resource and action level (e.g., orders:read, orders:write, payments:create) and enforce scope checks in every protected handler",
      "Validate JWTs rigorously: verify the signature with the issuer's public key, check the exp, nbf, iss, and aud claims, reject tokens with algorithm: none, and implement token revocation via a fast-lookup denylist or short expiry windows",
      "Harden CORS configuration: explicitly whitelist allowed origins (avoid wildcard * for credentialed requests), restrict allowed methods and headers, and set the Access-Control-Max-Age header to reduce preflight request frequency",
      "Apply OWASP API Security Top 10 mitigations: enforce object-level authorization on every resource (BOLA), implement rate limiting and quota enforcement, validate all input payloads with strict schemas, and log all authentication events for audit and anomaly detection"
    ],
    "steps_zh": [
      "按敏感程度对所有 API 端点分类，选择合适的认证机制：API 密钥用于服务间集成，OAuth2 客户端凭证用于机器对机器流程，OAuth2 授权码 + PKCE 用于用户授权访问，mTLS 用于高安全性服务网格通信",
      "实施细粒度的 OAuth2 范围遵循最小权限原则：在资源和操作级别定义范围（如 orders:read、orders:write、payments:create），并在每个受保护的处理程序中执行范围检查",
      "严格验证 JWT：使用颁发者的公钥验证签名，检查 exp、nbf、iss 和 aud 声明，拒绝 algorithm: none 的令牌，并通过快速查找黑名单或短过期窗口实现令牌吊销",
      "强化 CORS 配置：明确白名单允许的来源（避免对凭证请求使用通配符 *），限制允许的方法和头部，设置 Access-Control-Max-Age 头部以减少预检请求频率",
      "应用 OWASP API 安全 Top 10 缓解措施：对每个资源执行对象级授权（BOLA）、实施速率限制和配额执行、用严格模式验证所有输入载荷，以及记录所有认证事件用于审计和异常检测"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Auth Mechanism",
      "OAuth2 Scopes",
      "JWT Validation",
      "CORS Config",
      "OWASP API Top 10"
    ],
    "viz_labels_zh": [
      "认证机制",
      "OAuth2范围",
      "JWT验证",
      "CORS配置",
      "API Top 10"
    ],
    "related": [
      "api-gateway-pattern",
      "zero-trust-architecture"
    ],
    "tags": [
      "security",
      "oauth2",
      "jwt",
      "cors",
      "owasp",
      "authentication",
      "authorization"
    ],
    "origin_author": "OWASP Foundation",
    "origin_source": "OWASP API Security Project (owasp.org/API-Security); OAuth 2.0 Authorization Framework (RFC 6749, IETF 2012); JSON Web Token (RFC 7519, IETF 2015)",
    "origin_source_zh": "OWASP API 安全项目（owasp.org/API-Security）；OAuth 2.0 授权框架（RFC 6749，IETF 2012）；JSON Web Token（RFC 7519，IETF 2015）",
    "complexity": "advanced",
    "when_to_use": [
      "When designing any API that exposes sensitive data, financial operations, or user-delegated actions to external consumers",
      "When a security audit or penetration test has identified authentication weaknesses (missing authorization checks, overly permissive CORS, weak token validation)",
      "When integrating with third-party systems that require standardized token-based authentication (OAuth2/OIDC)",
      "When compliance frameworks (PCI-DSS, HIPAA, SOC 2) mandate documented API authentication and authorization controls"
    ],
    "when_to_use_zh": [
      "当设计任何向外部消费者暴露敏感数据、金融操作或用户授权行为的 API 时",
      "当安全审计或渗透测试发现了认证弱点（缺失授权检查、过于宽松的 CORS、弱令牌验证）时",
      "当与需要标准化令牌认证（OAuth2/OIDC）的第三方系统集成时",
      "当合规框架（PCI-DSS、HIPAA、SOC 2）要求有文档记录的 API 认证和授权控制时"
    ],
    "core_concepts": [
      "OAuth2 Scopes and Least Privilege: Defining permission scopes at the resource-action level (e.g., accounts:read) and issuing tokens with only the scopes needed for the specific client integration, minimizing blast radius if a token is compromised",
      "JWT Validation Chain: The full validation sequence — signature verification, claim checks (exp, iss, aud), algorithm restriction — must all pass before a request is authorized; skipping any step creates exploitable vulnerabilities",
      "BOLA (Broken Object Level Authorization): The most prevalent API vulnerability, where an API returns data for any ID in the URL without checking that the authenticated user owns or has permission to access that object",
      "CORS Policy Hardening: Cross-Origin Resource Sharing misconfiguration (wildcard origins with credentials, overly permissive methods) allows malicious web pages to make authenticated API calls on behalf of browser users",
      "mTLS for Service-to-Service: Mutual TLS authentication, where both client and server present certificates, provides cryptographic proof of identity for high-security internal API communication in zero-trust environments"
    ],
    "core_concepts_zh": [
      "OAuth2 范围与最小权限：在资源-操作级别定义权限范围（如 accounts:read），并仅向特定客户端集成颁发所需范围的令牌，在令牌泄露时将影响范围降至最低",
      "JWT 验证链：完整的验证序列——签名验证、声明检查（exp、iss、aud）、算法限制——所有环节必须通过才能授权请求；跳过任何一步都会产生可利用的漏洞",
      "BOLA（对象级授权缺失）：最普遍的 API 漏洞，API 在不检查认证用户是否拥有或有权访问的情况下返回 URL 中任意 ID 的数据",
      "CORS 策略强化：跨源资源共享错误配置（通配符来源与凭证、过度宽松的方法）允许恶意网页代表浏览器用户发起认证 API 调用",
      "服务间 mTLS：双向 TLS 认证，客户端和服务端都出示证书，为零信任环境中高安全性内部 API 通信提供密码学身份证明"
    ],
    "timeline": [
      [
        "2007",
        "OAuth 1.0 published as an open standard, providing the first widely adopted framework for delegated API authorization"
      ],
      [
        "2012",
        "OAuth 2.0 (RFC 6749) and Bearer Token (RFC 6750) published, becoming the dominant API authorization framework"
      ],
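      [
        "2015",
        "JSON Web Token (RFC 7519) and the PKCE extension (RFC 7636) published by the IETF, standardizing signed token claims and protection against authorization code interception"
      ],
      [
        "2019",
        "OWASP publishes the first API Security Top 10 (2019 edition), cataloging the most critical API vulnerabilities including broken authentication and BOLA; OAuth security best practice moves toward requiring PKCE (RFC 7636) for public OAuth2 clients to prevent authorization code interception"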
      ]
    ],
    "timeline_zh": [
      [
        "2007",
        "OAuth 1.0 作为开放标准发布，提供了首个被广泛采用的委托 API 授权框架"
      ],
      [
        "2012",
        "OAuth 2.0（RFC 6749）和 Bearer Token（RFC 6750）发布，成为主导的 API 授权框架"
      ],
      [
        "2015",
        "JSON Web Token（RFC 7519）和 PKCE 扩展（RFC 7636）由 IETF 发布，标准化了签名令牌声明并为授权码拦截提供防护"
      ],
      [
        "2019",
        "OWASP 发布首个 API 安全 Top 10（2019 版），编目包括认证缺陷和 BOLA 在内的最关键 API 漏洞；OAuth 安全最佳实践逐步要求公共 OAuth2 客户端使用 PKCE（RFC 7636）以防止授权码拦截"
      ]
    ],
    "dos": [
      "Do validate the full JWT claim set — signature, expiry, issuer, and audience — on every request, using a well-maintained library rather than writing custom JWT validation code",
      "Do implement object-level authorization checks in every API handler that returns or modifies a specific resource, not just at the route/controller level",
      "Do rotate API keys and client secrets periodically and provide a self-service key rotation mechanism so clients can rotate without downtime",
      "Do log every authentication failure, token validation error, and authorization rejection with structured fields (client_id, endpoint, error_code) for security monitoring and incident response"
    ],
    "dos_zh": [
      "在每次请求时验证完整的 JWT 声明集——签名、过期时间、颁发者和受众——使用维护良好的库，而非编写自定义 JWT 验证代码",
      "在每个返回或修改特定资源的 API 处理程序中实施对象级授权检查，而非仅在路由/控制器层面",
      "定期轮换 API 密钥和客户端密钥，并提供自助密钥轮换机制，使客户端无需停机即可轮换",
      "用结构化字段（client_id、endpoint、error_code）记录每次认证失败、令牌验证错误和授权拒绝，用于安全监控和事件响应"
    ],
    "donts": [
      "Don't use HTTP Basic Auth or API keys in query parameters for anything beyond low-sensitivity internal tooling — both are easily leaked in logs, browser history, and referrer headers",
      "Don't trust client-supplied claims (user_id, role, is_admin) in JWT payloads without verifying them against your authorization database — JWT payload data is readable by anyone with the token",
      "Don't configure CORS with Access-Control-Allow-Origin: * combined with Access-Control-Allow-Credentials: true — browsers block this combination, and if somehow accepted it would allow any site to make credentialed requests",
      "Don't skip rate limiting on authentication endpoints (login, token exchange) — credential stuffing and brute-force attacks specifically target unthrottled auth endpoints"
    ],
    "donts_zh": [
      "不要对低敏感度内部工具以外的任何场景使用 HTTP Basic Auth 或查询参数中的 API 密钥——两者都容易在日志、浏览器历史和 Referer 头部中泄露",
      "不要在未针对授权数据库验证的情况下信任 JWT 载荷中客户端提供的声明（user_id、role、is_admin）——任何持有令牌的人都可以读取 JWT 载荷数据",
      "不要将 CORS 配置为 Access-Control-Allow-Origin: * 同时配合 Access-Control-Allow-Credentials: true——浏览器会阻止这种组合，如果以某种方式被接受则会允许任何站点发起凭证请求",
      "不要对认证端点（登录、令牌交换）跳过速率限制——凭证填充和暴力破解攻击专门针对未节流的认证端点"
    ],
    "case_study_company": "Cloudflare",
    "case_study": "Cloudflare's API security implementation serves as a reference for the industry. The Cloudflare API uses OAuth2 with fine-grained permission tokens that scope access to specific zones, account resources, and action types. In 2022, Cloudflare publicly documented their migration from coarse-grained API keys to token-based authentication with 180+ distinct permission scopes. After the migration, Cloudflare reported a significant reduction in over-privileged token usage and eliminated a class of account takeover incidents where a single leaked API key had provided complete account access.",
    "case_study_zh": "Cloudflare 的 API 安全实施是行业的参考标准。Cloudflare API 使用 OAuth2 和细粒度权限令牌，将访问范围限定到特定区域、账户资源和操作类型。2022 年，Cloudflare 公开记录了其从粗粒度 API 密钥迁移到基于令牌认证（拥有 180 多个不同权限范围）的过程。迁移后，Cloudflare 报告过度特权令牌的使用显著减少，并消除了一类因单个泄露 API 密钥提供完整账户访问而导致的账户接管事件。",
    "when_not_to_use": [
      "Fully internal APIs within a trusted service mesh that already enforces mTLS at the infrastructure level — adding application-layer OAuth2 on top adds overhead without security benefit",
      "Development and testing environments where the overhead of full OAuth2 flows slows iteration — use simplified API key auth with environment-specific secrets instead",
      "Static public data APIs (public transit schedules, weather data) with no personalization or sensitive data where authentication adds friction without protecting anything",
      "APIs in air-gapped or highly controlled internal networks where network-level controls already provide the required isolation"
    ],
    "when_not_to_use_zh": [
      "在基础设施层已强制执行 mTLS 的可信服务网格内的纯内部 API——在其上叠加应用层 OAuth2 只会增加开销而无安全收益",
      "开发和测试环境中，完整 OAuth2 流程的开销拖慢迭代速度——改用带环境特定密钥的简化 API 密钥认证",
      "没有个性化或敏感数据的静态公开数据 API（公共交通时刻表、天气数据），认证只会增加摩擦而不保护任何东西",
      "网络层控制已提供所需隔离的气隙或高度受控内部网络中的 API"
    ],
    "adopters": [
      "Cloudflare",
      "Okta",
      "Auth0",
      "Google",
      "Microsoft"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "security",
      "reliability",
      "observability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "OWASP Foundation (2023). \"OWASP API Security Top 10 2023\". owasp.org/API-Security.",
    "secondary_sources": [
      "IETF (2012). \"RFC 6749: The OAuth 2.0 Authorization Framework\". tools.ietf.org.",
      "IETF (2015). \"RFC 7519: JSON Web Token (JWT)\". tools.ietf.org.",
      "Madden, N. (2020). \"API Security in Action\". Manning Publications."
    ],
    "typed_relations": [
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      },
      {
        "slug": "zero-trust-architecture",
        "type": "extends"
      }
    ]
  },
  {
    "id": 243,
    "name": "Event-Driven API Design",
    "name_zh": "事件驱动 API 设计",
    "slug": "event-driven-api-design",
    "category": "api",
    "desc": "Server-Sent Events, WebSocket, and MQTT patterns for real-time, asynchronous API communication",
    "desc_zh": "服务端推送事件、WebSocket 和 MQTT 模式，用于实时、异步的 API 通信",
    "steps": [
      "Identify the real-time requirements: unidirectional server-to-client pushes (SSE), bidirectional full-duplex messaging (WebSocket), or constrained IoT device messaging with QoS guarantees (MQTT), then select the appropriate protocol",
      "Design the event schema: define event types, payload structure, and metadata (event_id, event_type, timestamp, correlation_id) using a schema registry (AsyncAPI, Avro, JSON Schema) to enable consumer validation and code generation",
      "Implement connection lifecycle management: handle reconnection with exponential backoff (SSE Last-Event-ID replay, WebSocket reconnect logic), define heartbeat/ping intervals to detect stale connections, and document maximum connection durations",
      "Apply backpressure and flow control: implement per-connection message queues with bounded capacity, use MQTT QoS levels (0, 1, 2) appropriate to message criticality, and expose metrics on queue depth and delivery latency",
      "Secure the event channel: authenticate the initial connection with the same OAuth2/JWT patterns used for REST, authorize subscription to specific event topics per client scope, and encrypt event payloads for sensitive data"
    ],
    "steps_zh": [
      "识别实时需求：单向服务端到客户端推送（SSE）、双向全双工消息（WebSocket），还是带有 QoS 保证的受限 IoT 设备消息（MQTT），然后选择合适的协议",
      "设计事件模式：使用模式注册表（AsyncAPI、Avro、JSON Schema）定义事件类型、载荷结构和元数据（event_id、event_type、timestamp、correlation_id），以支持消费者验证和代码生成",
      "实现连接生命周期管理：使用指数退避处理重连（SSE Last-Event-ID 回放、WebSocket 重连逻辑）、定义心跳/ping 间隔以检测陈旧连接，并记录最大连接持续时间",
      "应用背压和流量控制：为每个连接实现有界容量的消息队列，使用与消息重要性相匹配的 MQTT QoS 级别（0、1、2），并暴露队列深度和传递延迟的指标",
      "保护事件通道：使用与 REST 相同的 OAuth2/JWT 模式对初始连接进行认证，按客户端范围授权订阅特定事件主题，并对敏感数据的事件载荷加密"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Protocol Select",
      "Event Schema",
      "Connection Lifecycle",
      "Backpressure",
      "Secure Channel"
    ],
    "viz_labels_zh": [
      "协议选择",
      "事件模式",
      "连接管理",
      "背压控制",
      "通道安全"
    ],
    "related": [
      "api-gateway-pattern",
      "api-security-patterns"
    ],
    "tags": [
      "real-time",
      "websocket",
      "sse",
      "mqtt",
      "async",
      "event-driven",
      "streaming"
    ],
    "origin_author": "Clemens Vasters",
    "origin_source": "Clemens Vasters (2021). \"Messaging\". microsoft.github.io/cloud-design-patterns; AsyncAPI Specification (asyncapi.com); OASIS MQTT Standard (mqtt.org)",
    "origin_source_zh": "Clemens Vasters（2021）。「消息传递」，microsoft.github.io/cloud-design-patterns；AsyncAPI 规范（asyncapi.com）；OASIS MQTT 标准（mqtt.org）",
    "complexity": "advanced",
    "when_to_use": [
      "When clients need to react to server-side state changes (order status updates, live price feeds, collaborative editing) without polling",
      "When building IoT data pipelines where millions of devices publish sensor readings to a backend that must ingest and distribute them with QoS guarantees",
      "When a microservices-based system needs to decouple producers and consumers of domain events through a message broker, avoiding synchronous request chains",
      "When AI-generated streaming responses (LLM token streaming) must be delivered to clients as they are produced rather than waiting for the complete response"
    ],
    "when_to_use_zh": [
      "当客户端需要对服务端状态变化（订单状态更新、实时价格推送、协同编辑）作出响应而无需轮询时",
      "当构建 IoT 数据管道，数百万设备向后端发布传感器读数，后端必须以 QoS 保证摄取和分发这些数据时",
      "当基于微服务的系统需要通过消息代理解耦领域事件的生产者和消费者，避免同步请求链时",
      "当 AI 生成的流式响应（LLM 令牌流）需要在生成时立即传递给客户端，而非等待完整响应时"
    ],
    "core_concepts": [
      "Server-Sent Events (SSE): A unidirectional HTTP/1.1 streaming protocol where the server pushes events to the client over a persistent connection; clients reconnect automatically using the Last-Event-ID header for replay after disconnection",
      "WebSocket: A full-duplex, bidirectional protocol over a single TCP connection established via an HTTP upgrade handshake; suitable for interactive applications requiring low-latency bidirectional messaging",
      "MQTT: A lightweight publish-subscribe protocol designed for constrained IoT devices, providing three QoS levels (at most once, at least once, exactly once) and a topic-based routing model",
      "AsyncAPI: A specification format for event-driven APIs analogous to OpenAPI for REST; describes channels, message schemas, bindings (Kafka, AMQP, WebSocket), and server configurations",
      "Backpressure: The mechanism by which a consumer signals to a producer that it cannot keep up with the event rate, preventing unbounded queue growth and out-of-memory failures in high-throughput event streams"
    ],
    "core_concepts_zh": [
      "服务端推送事件（SSE）：一种单向 HTTP/1.1 流式协议，服务端通过持久连接向客户端推送事件；客户端断线后使用 Last-Event-ID 头部自动重连以进行回放",
      "WebSocket：通过 HTTP 升级握手建立的单个 TCP 连接上的全双工双向协议；适用于需要低延迟双向消息的交互式应用",
      "MQTT：为受限 IoT 设备设计的轻量级发布-订阅协议，提供三个 QoS 级别（最多一次、至少一次、恰好一次）和基于主题的路由模型",
      "AsyncAPI：事件驱动 API 的规范格式，类似于 REST 的 OpenAPI；描述通道、消息模式、绑定（Kafka、AMQP、WebSocket）和服务器配置",
      "背压：消费者向生产者发出信号表明无法跟上事件速率的机制，防止高吞吐量事件流中的无界队列增长和内存溢出故障"
    ],
    "timeline": [
      [
        "2006",
        "Comet and long-polling techniques emerge as workarounds for browser-based real-time communication before native WebSocket support"
      ],
      [
        "2011",
        "WebSocket protocol standardized as RFC 6455; HTML5 EventSource API (SSE) reaches W3C candidate recommendation"
      ],
      [
        "2014",
        "MQTT v3.1.1 standardized by OASIS; adopted as the primary IoT messaging protocol by AWS IoT, Azure IoT Hub, and Google Cloud IoT"
      ],
      [
        "2020",
        "AsyncAPI 2.0 published, providing the first comprehensive specification format for event-driven APIs across multiple protocols"
      ]
    ],
    "timeline_zh": [
      [
        "2006",
        "Comet 和长轮询技术作为浏览器原生 WebSocket 支持之前的实时通信变通方案出现"
      ],
      [
        "2011",
        "WebSocket 协议标准化为 RFC 6455；HTML5 EventSource API（SSE）达到 W3C 候选推荐状态"
      ],
      [
        "2014",
        "MQTT v3.1.1 由 OASIS 标准化；被 AWS IoT、Azure IoT Hub 和 Google Cloud IoT 采用为主要 IoT 消息协议"
      ],
      [
        "2020",
        "AsyncAPI 2.0 发布，为跨多种协议的事件驱动 API 提供首个综合规范格式"
      ]
    ],
    "dos": [
      "Do use SSE for unidirectional server-push use cases in web browsers — it works over HTTP/2, benefits from HTTP infrastructure (proxies, load balancers), and has automatic reconnect semantics built into the browser",
      "Do define event schemas in AsyncAPI or JSON Schema and validate inbound and outbound event payloads against them to catch schema drift before it reaches consumers",
      "Do implement exponential backoff with jitter for WebSocket and SSE reconnection to prevent thundering herd reconnection storms after a server restart or network partition",
      "Do expose per-consumer lag metrics (queue depth, delivery latency percentiles) so operations teams can detect backpressure conditions before they cause consumer failures"
    ],
    "dos_zh": [
      "在 Web 浏览器的单向服务端推送场景中使用 SSE——它基于 HTTP/2 工作，受益于 HTTP 基础设施（代理、负载均衡器），且浏览器内置自动重连语义",
      "在 AsyncAPI 或 JSON Schema 中定义事件模式，并针对这些模式验证入站和出站事件载荷，以便在模式漂移到达消费者之前捕获",
      "为 WebSocket 和 SSE 重连实现带抖动的指数退避，防止服务器重启或网络分区后的惊群重连风暴",
      "暴露每个消费者的延迟指标（队列深度、传递延迟百分位），使运维团队能在背压条件导致消费者故障之前检测到它"
    ],
    "donts": [
      "Don't use WebSocket for simple unidirectional push — SSE is simpler, works through HTTP proxies without configuration, and provides automatic reconnect, while WebSocket requires additional infrastructure support",
      "Don't send large binary payloads over WebSocket without framing and size limits — a single oversized message can exhaust server memory if not properly bounded",
      "Don't skip authentication on WebSocket upgrade requests — the initial HTTP handshake must validate credentials because subsequent messages will arrive without request headers",
      "Don't use MQTT QoS 0 (at most once) for critical messages where message loss is unacceptable — use QoS 1 or 2 based on the required delivery guarantee"
    ],
    "donts_zh": [
      "不要对简单的单向推送使用 WebSocket——SSE 更简单、无需配置即可通过 HTTP 代理、提供自动重连，而 WebSocket 需要额外的基础设施支持",
      "不要在没有帧处理和大小限制的情况下通过 WebSocket 发送大型二进制载荷——单个超大消息如果没有适当限制可能耗尽服务器内存",
      "不要跳过 WebSocket 升级请求的认证——初始 HTTP 握手必须验证凭证，因为后续消息到达时不会携带请求头",
      "不要对消息丢失不可接受的关键消息使用 MQTT QoS 0（最多一次）——根据所需的传递保证使用 QoS 1 或 2"
    ],
    "case_study_company": "Figma",
    "case_study": "Figma's real-time collaborative design editor uses WebSocket connections to synchronize canvas operations between all editors viewing the same file simultaneously. When a user moves an object or edits a shape, the operation is sent over WebSocket, transformed using operational transformation (OT) or CRDT algorithms to merge concurrent edits, and broadcast to all other connected clients within milliseconds. Figma's WebSocket infrastructure handles millions of simultaneous connections and supports collaborative sessions with dozens of concurrent editors without frame drops, demonstrating that WebSocket-based event-driven API design can power professional-grade real-time collaborative applications.",
    "case_study_zh": "Figma 的实时协作设计编辑器使用 WebSocket 连接在同时查看同一文件的所有编辑者之间同步画布操作。当用户移动对象或编辑图形时，操作通过 WebSocket 发送，使用操作变换（OT）或 CRDT 算法合并并发编辑，并在毫秒内广播到所有其他已连接客户端。Figma 的 WebSocket 基础设施处理数百万个同时连接，并在没有帧丢失的情况下支持数十个并发编辑者的协作会话，证明了基于 WebSocket 的事件驱动 API 设计可以为专业级实时协作应用提供支撑。",
    "when_not_to_use": [
      "Simple request-response interactions where the client initiates a query and receives a single response — REST or gRPC is simpler and better supported",
      "APIs where all communication is initiated by the client and server responses are always complete and synchronous — polling with a short interval is simpler to implement and debug",
      "Serverless functions (AWS Lambda, Cloudflare Workers) that terminate after a single request-response cycle — persistent connections require long-running processes",
      "High-security environments where long-lived TCP connections are prohibited by network policy and each request must be individually authenticated"
    ],
    "when_not_to_use_zh": [
      "客户端发起查询并接收单个响应的简单请求-响应交互——REST 或 gRPC 更简单且支持更好",
      "所有通信由客户端发起且服务端响应始终完整且同步的 API——短间隔轮询实现和调试更简单",
      "在单次请求-响应周期后终止的无服务器函数（AWS Lambda、Cloudflare Workers）——持久连接需要长时间运行的进程",
      "网络策略禁止长期 TCP 连接且每个请求必须单独认证的高安全性环境"
    ],
    "adopters": [
      "Figma",
      "Slack",
      "Discord",
      "Twitch",
      "AWS IoT"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "performance",
      "scalability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Vasters, C. (2021). \"Messaging Patterns\". microsoft.github.io/cloud-design-patterns.",
    "secondary_sources": [
      "IETF (2011). \"RFC 6455: The WebSocket Protocol\". tools.ietf.org.",
      "AsyncAPI Initiative (2020). \"AsyncAPI Specification 2.0\". asyncapi.com.",
      "OASIS (2014). \"MQTT Version 3.1.1\". docs.oasis-open.org."
    ],
    "typed_relations": [
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      },
      {
        "slug": "api-security-patterns",
        "type": "complement"
      }
    ]
  },
  {
    "id": 244,
    "name": "API Composition Pattern",
    "name_zh": "API 组合模式",
    "slug": "api-composition-pattern",
    "category": "api",
    "desc": "Aggregating multiple microservice APIs into a unified interface to fulfill client queries without requiring cross-service joins on the client side",
    "desc_zh": "将多个微服务 API 聚合为统一接口，让客户端无需进行跨服务联结即可满足查询需求",
    "steps": [
      "Identify the client query that requires data from multiple microservices and cannot be efficiently fulfilled by a single service without introducing tight coupling between services",
      "Implement a Composer service (or API Gateway policy) that accepts the client request, fans out parallel sub-requests to the involved microservices, and waits for all responses with a defined timeout",
      "Merge and transform the responses: join data on shared identifiers, filter fields to match the client's required schema, apply business rules (e.g., aggregate totals, derive computed fields), and produce a unified response",
      "Handle partial failures: define a fallback strategy for each sub-request (return cached data, return null, return an error indicator in the response) so a single service outage does not void the entire composite response",
      "Optimize for performance: parallelize independent sub-requests, cache sub-responses where freshness allows, and expose the composite endpoint through the API gateway with response caching and rate limiting"
    ],
    "steps_zh": [
      "识别需要来自多个微服务数据的客户端查询，该查询无法由单个服务高效满足，否则会引入服务间的紧密耦合",
      "实现一个组合器服务（或 API 网关策略），接收客户端请求，向涉及的微服务并行发出子请求，并在定义的超时时间内等待所有响应",
      "合并和转换响应：通过共享标识符联结数据、过滤字段以匹配客户端所需模式、应用业务规则（如聚合总计、推导计算字段），并生成统一响应",
      "处理部分失败：为每个子请求定义回退策略（返回缓存数据、返回 null、在响应中返回错误指示器），使单个服务中断不会使整个组合响应失效",
      "优化性能：并行化独立子请求，在允许新鲜度的地方缓存子响应，并通过 API 网关将组合端点暴露给外部，配置响应缓存和速率限制"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Identify Query",
      "Fan-Out",
      "Merge Transform",
      "Partial Failure",
      "Optimize Cache"
    ],
    "viz_labels_zh": [
      "识别查询",
      "扇出请求",
      "合并转换",
      "部分失败",
      "缓存优化"
    ],
    "related": [
      "api-gateway-pattern",
      "bff-pattern",
      "saga-pattern"
    ],
    "tags": [
      "microservices",
      "aggregation",
      "composition",
      "api-gateway",
      "fan-out"
    ],
    "origin_author": "Chris Richardson",
    "origin_source": "Chris Richardson (2018). \"Microservices Patterns\". Manning Publications, Ch. 11 (Developing business logic with Sagas); microservices.io/patterns/data/api-composition.html",
    "origin_source_zh": "Chris Richardson（2018）。《微服务架构设计模式》，Manning，第 11 章；microservices.io/patterns/data/api-composition.html",
    "complexity": "intermediate",
    "when_to_use": [
      "When a client UI screen needs to display data owned by multiple microservices (e.g., an order detail page that joins order, customer, inventory, and payment data)",
      "When the alternative — having the client make multiple sequential API calls — creates unacceptable latency due to the number of round-trips and network overhead",
      "When a mobile or web client has limited processing power or bandwidth and benefits from server-side aggregation that reduces payload size and number of requests",
      "When implementing a Backend for Frontend (BFF) layer that tailors the API surface to specific client types (mobile, web, partner)"
    ],
    "when_to_use_zh": [
      "当客户端 UI 页面需要显示多个微服务拥有的数据时（如订单详情页需要联结订单、客户、库存和支付数据）",
      "当替代方案——让客户端发起多次串行 API 调用——因往返次数和网络开销导致不可接受的延迟时",
      "当移动端或 Web 客户端处理能力或带宽有限，受益于服务端聚合以减少载荷大小和请求数量时",
      "当实现后端对前端（BFF）层，为特定客户端类型（移动端、Web、合作伙伴）定制 API 表面时"
    ],
    "core_concepts": [
      "Fan-Out/Fan-In: The composer fans out a single client request into multiple parallel sub-requests to individual microservices, then fans in by merging all sub-responses into a single unified response",
      "Partial Failure Tolerance: Designing the composite response to degrade gracefully when a sub-service is unavailable — returning cached, null, or indicator values rather than propagating the error to the entire response",
      "Data Join on Shared Keys: Combining data from multiple services using shared domain identifiers (order_id, customer_id) to produce a denormalized view that would require a database join in a monolithic architecture",
      "Timeout Budget: The composite operation must complete within a wall-clock timeout that accounts for the slowest sub-request; implementing per-sub-request timeouts prevents a single slow service from blocking the entire composition",
      "Idempotent Sub-Requests: Sub-requests should be read-only (GET operations) when possible; if writes are required as part of composition, the Saga pattern must be used to coordinate distributed transactions safely"
    ],
    "core_concepts_zh": [
      "扇出/扇入：组合器将单个客户端请求扇出为多个并行子请求发送至各微服务，再通过扇入将所有子响应合并为单个统一响应",
      "部分失败容错：设计组合响应在子服务不可用时优雅降级——返回缓存值、null 或指示器值，而非将错误传播到整个响应",
      "基于共享键的数据联结：使用共享领域标识符（order_id、customer_id）组合多个服务的数据，生成在单体架构中需要数据库联结的去规范化视图",
      "超时预算：组合操作必须在考虑最慢子请求的挂钟超时内完成；为每个子请求实现超时可防止单个慢速服务阻塞整个组合",
      "幂等子请求：子请求尽可能应为只读（GET 操作）；如果组合过程中需要写操作，必须使用 Saga 模式安全协调分布式事务"
    ],
    "timeline": [
      [
        "2014",
        "Microservices architecture popularized by Martin Fowler and James Lewis; data scatter across services makes API composition a necessary pattern"
      ],
      [
        "2015",
        "Netflix's Zuul and Amazon API Gateway popularize gateway-level request aggregation for microservices at scale"
      ],
      [
        "2018",
        "Chris Richardson formalizes API Composition as a named pattern in 'Microservices Patterns', distinguishing it from Saga-based composition for write operations"
      ],
      [
        "2022",
        "GraphQL federation (Apollo Federation) emerges as a declarative alternative to imperative API composition, composing schemas from multiple subgraph services"
      ]
    ],
    "timeline_zh": [
      [
        "2014",
        "Martin Fowler 和 James Lewis 推广微服务架构；数据分散于各服务使 API 组合成为必要模式"
      ],
      [
        "2015",
        "Netflix 的 Zuul 和 Amazon API Gateway 推广了微服务网关层请求聚合"
      ],
      [
        "2018",
        "Chris Richardson 在《微服务架构设计模式》中将 API 组合正式命名为模式，将其与用于写操作的基于 Saga 的组合区分开来"
      ],
      [
        "2022",
        "GraphQL 联邦（Apollo Federation）作为命令式 API 组合的声明式替代方案出现，从多个子图服务组合模式"
      ]
    ],
    "dos": [
      "Do parallelize all independent sub-requests — sequential composition multiplies latency and defeats the purpose of aggregation; use async/await or Promise.all patterns to execute concurrent sub-requests",
      "Do define a per-sub-request timeout that is shorter than the composite timeout so a slow downstream service is cut off before it causes the entire composition to time out",
      "Do version the composite API independently from individual microservice APIs — internal service changes should not require clients to update their composite endpoint calls",
      "Do cache sub-responses for reference data (product catalog, user profiles) that does not change per-request, reducing downstream service load on frequently composed queries"
    ],
    "dos_zh": [
      "并行化所有独立子请求——串行组合会成倍增加延迟，违背聚合目的；使用 async/await 或 Promise.all 模式执行并发子请求",
      "定义比组合超时更短的单个子请求超时，使慢速下游服务在导致整个组合超时之前被切断",
      "独立于各微服务 API 对组合 API 进行版本管理——内部服务变更不应要求客户端更新其组合端点调用",
      "对请求间不变的参考数据（产品目录、用户资料）缓存子响应，减少频繁组合查询对下游服务的负载"
    ],
    "donts": [
      "Don't use API Composition for write operations that must be atomic across services — use the Saga pattern with compensating transactions for distributed write coordination",
      "Don't compose more than 5-7 sub-services in a single composite request — deep fan-out increases latency variance, failure probability, and debugging complexity",
      "Don't expose the composer's internal service topology to clients through error messages — return client-friendly error codes and log internal sub-request failures separately",
      "Don't implement composition logic in the client — server-side composition keeps the network topology private, reduces client complexity, and allows the composition strategy to evolve without client changes"
    ],
    "donts_zh": [
      "不要将 API 组合用于必须跨服务原子执行的写操作——使用带补偿事务的 Saga 模式进行分布式写操作协调",
      "不要在单个组合请求中组合超过 5-7 个子服务——深度扇出会增加延迟方差、故障概率和调试复杂度",
      "不要通过错误消息将组合器的内部服务拓扑暴露给客户端——返回客户端友好的错误码并单独记录内部子请求失败",
      "不要在客户端实现组合逻辑——服务端组合保持网络拓扑的私密性、降低客户端复杂度，并允许组合策略在不影响客户端的情况下演进"
    ],
    "case_study_company": "Amazon",
    "case_study": "Amazon's product detail page is the canonical example of API Composition at scale. Displaying a single product page requires aggregating data from dozens of internal microservices: product catalog, pricing engine, inventory, review service, recommendation engine, seller profile, and fulfillment estimates. Amazon's internal framework composes these in parallel, with each sub-service returning its section of the page independently. Services that fail or exceed their timeout budget return a placeholder or cached value rather than breaking the page. This approach allows Amazon to deploy over 1,000 distinct microservices that each evolve independently while composing seamlessly into the product detail experience that serves billions of page views daily.",
    "case_study_zh": "亚马逊的商品详情页是大规模 API 组合的经典案例。显示单个商品页面需要聚合来自数十个内部微服务的数据：商品目录、定价引擎、库存、评论服务、推荐引擎、卖家资料和配送估算。亚马逊的内部框架并行组合这些服务，每个子服务独立返回页面的对应部分。失败或超出超时预算的服务返回占位符或缓存值，而不是破坏整个页面。这种方式让亚马逊能够部署超过 1000 个各自独立演进的微服务，同时无缝组合成每日提供数十亿次页面访问的商品详情体验。",
    "when_not_to_use": [
      "When the data needed for a client request lives in a single service — composition adds unnecessary infrastructure and latency overhead",
      "When the query requires write operations across multiple services — use the Saga pattern for distributed transaction coordination instead",
      "When all consuming clients are internal services that can tolerate multiple service calls — composition is primarily valuable for external clients with constrained network bandwidth or processing capacity",
      "When the services being composed have fundamentally incompatible SLAs — composing a 99.9% service with a 99% service yields a composite SLA of approximately 98.9%, which may be unacceptable"
    ],
    "when_not_to_use_zh": [
      "当客户端请求所需的数据来自单个服务时——组合会增加不必要的基础设施和延迟开销",
      "当查询需要跨多个服务的写操作时——改用 Saga 模式进行分布式事务协调",
      "当所有消费客户端都是可以容忍多次服务调用的内部服务时——组合主要对网络带宽或处理能力受限的外部客户端有价值",
      "当被组合的服务具有根本不兼容的 SLA 时——将 99.9% 服务与 99% 服务组合会产生约 98.9% 的组合 SLA，这可能无法接受"
    ],
    "adopters": [
      "Amazon",
      "Netflix",
      "Uber",
      "Airbnb",
      "LinkedIn"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "performance",
      "scalability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Richardson, C. (2018). \"Microservices Patterns\". Manning Publications. Ch. 8 (Querying in a microservice architecture).",
    "secondary_sources": [
      "Richardson, C. (2020). \"API Composition pattern\". microservices.io/patterns/data/api-composition.html.",
      "Newman, S. (2019). \"Monolith to Microservices\". O'Reilly Media.",
      "Fowler, M. & Lewis, J. (2014). \"Microservices\". martinfowler.com."
    ],
    "typed_relations": [
      {
        "slug": "api-gateway-pattern",
        "type": "complement"
      },
      {
        "slug": "bff-pattern",
        "type": "complement"
      },
      {
        "slug": "saga-pattern",
        "type": "related"
      }
    ]
  },
  {
    "id": 313,
    "name": "GraphQL Federation",
    "name_zh": "GraphQL联邦",
    "slug": "graphql-federation",
    "category": "api",
    "desc": "A composition model for GraphQL where multiple independently deployed subgraph services contribute to a unified supergraph, enabling teams to own their schema slice while consumers see a single coherent API",
    "desc_zh": "GraphQL的组合模型，多个独立部署的子图服务共同组成统一的超图，使团队能够拥有其模式切片，同时消费者看到单一连贯的API",
    "steps": [
      "Decompose the domain into bounded subgraphs: identify service boundaries (users, products, orders) and assign each team ownership of the GraphQL types and resolvers within their domain, starting with a thin supergraph across two or three services",
      "Annotate types with federation directives: use @key to declare entity primary keys, @extends to reference types owned by other subgraphs, and @external to mark fields resolved elsewhere — these directives encode the composition rules",
      "Deploy a GraphQL Router (Apollo Router, Cosmo Router) in front of all subgraphs: the router reads the composed supergraph schema, plans query execution across subgraphs, and handles entity resolution fetches transparently to clients",
      "Establish a schema registry with composition validation: push each subgraph schema to a central registry (Apollo GraphOS, Cosmo) before deployment; the registry runs composition checks to catch breaking changes and merge conflicts before they reach production",
      "Implement federated authorization: use per-subgraph auth middleware or router-level policy enforcement to apply consistent access control across the unified graph without exposing inter-service trust boundaries to clients"
    ],
    "steps_zh": [
      "将领域分解为有界子图：识别服务边界（用户、产品、订单）并将每个团队分配为其领域内GraphQL类型和解析器的所有者，从跨两三个服务的薄超图开始",
      "用联邦指令注解类型：使用@key声明实体主键，@extends引用其他子图拥有的类型，@external标记在其他地方解析的字段——这些指令编码了组合规则",
      "在所有子图前部署GraphQL路由器（Apollo Router、Cosmo Router）：路由器读取组合的超图模式，跨子图规划查询执行，并对客户端透明地处理实体解析获取",
      "建立带有组合验证的模式注册表：在部署前将每个子图模式推送到中央注册表（Apollo GraphOS、Cosmo）；注册表运行组合检查以在进入生产前捕获破坏性变更和合并冲突",
      "实施联邦授权：使用每个子图的认证中间件或路由器级策略执行，在统一图上应用一致的访问控制，而不向客户端暴露服务间信任边界"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Subgraphs",
      "Federation Directives",
      "GraphQL Router",
      "Schema Registry",
      "Federated Auth"
    ],
    "viz_labels_zh": [
      "子图",
      "联邦指令",
      "路由器",
      "模式注册",
      "联邦鉴权"
    ],
    "related": [
      "graphql-schema-design",
      "bff-pattern",
      "api-gateway-pattern",
      "consumer-driven-contracts",
      "api-first-design"
    ],
    "tags": [
      "graphql",
      "federation",
      "api",
      "supergraph",
      "microservices"
    ],
    "origin_author": "Apollo Graph, Inc. (Matt DeBergalis, Trevor Scheer et al.)",
    "origin_year": 2019,
    "origin_source": "Apollo Graph, Inc. (2019). \"Introducing Apollo Federation\". apollographql.com/blog/announcement/apollo-federation-f260cf525d21",
    "origin_source_zh": "Apollo Graph公司（2019）。《介绍Apollo联邦》。apollographql.com/blog/announcement/apollo-federation-f260cf525d21",
    "complexity": "advanced",
    "abstraction_level": "system",
    "maturity_ring": "established",
    "quality_concerns": [
      "maintainability",
      "scalability",
      "performance"
    ],
    "adopters": [
      "Netflix",
      "Expedia",
      "GitHub",
      "Wayfair",
      "Shopify"
    ],
    "when_to_use": [
      "Large organizations with multiple teams that need to contribute to a shared GraphQL API without stepping on each other's changes",
      "Migrating from a monolithic GraphQL schema to team-owned services while preserving a unified client-facing API surface",
      "Product applications requiring complex cross-domain queries (e.g., a checkout page fetching user, product, inventory, and pricing data in one request)",
      "Organizations adopting a platform engineering model where a central API team provides the routing layer and individual product teams own their subgraphs"
    ],
    "when_to_use_zh": [
      "多团队需要为共享GraphQL API做贡献而不互相干扰变更的大型组织",
      "从单体GraphQL模式迁移到团队拥有的服务，同时保留统一的面向客户端的API表面",
      "需要复杂跨领域查询的产品应用（例如，结账页面在一个请求中获取用户、产品、库存和定价数据）",
      "采用平台工程模型的组织，中央API团队提供路由层，各产品团队拥有其子图"
    ],
    "core_concepts": [
      "Supergraph / Subgraph: The supergraph is the unified schema clients query; it is composed from multiple subgraphs — independently deployable services each owning a slice of the overall type system",
      "Entity and @key Directive: An entity is a type that can be resolved across subgraphs by its primary key. The @key directive declares which fields uniquely identify an entity, enabling the router to fetch entity fields from their authoritative subgraph",
      "Composition: The process of merging subgraph schemas into a valid supergraph schema. Composition rules enforce type compatibility, catch redefinition conflicts, and ensure every field has exactly one resolver",
      "Query Planning: The router analyzes an incoming query against the supergraph schema and generates a query plan — a directed acyclic graph of fetch operations to individual subgraphs — optimized to minimize round trips",
      "Schema Registry: A versioned store of subgraph schemas with automated composition validation, used to enforce schema governance and prevent breaking changes from reaching production without compatibility checks"
    ],
    "core_concepts_zh": [
      "超图/子图：超图是客户端查询的统一模式；它由多个子图组成——独立可部署的服务，每个服务拥有整体类型系统的一个切片",
      "实体和@key指令：实体是可以通过主键跨子图解析的类型。@key指令声明哪些字段唯一标识实体，使路由器能够从其权威子图获取实体字段",
      "组合：将子图模式合并为有效超图模式的过程。组合规则强制类型兼容性，捕获重定义冲突，并确保每个字段都有且只有一个解析器",
      "查询规划：路由器根据超图模式分析传入查询并生成查询计划——针对各个子图的获取操作的有向无环图——优化以最小化往返次数",
      "模式注册表：具有自动组合验证的子图模式版本化存储，用于强制模式治理并防止破坏性变更在没有兼容性检查的情况下到达生产环境"
    ],
    "timeline": [
      [
        "2012",
        "Facebook creates GraphQL internally for its News Feed mobile API; the specification is open-sourced in 2015"
      ],
      [
        "2018",
        "GraphQL adoption grows; teams hit the monolithic schema scaling wall as multiple teams fight over a single schema repo"
      ],
      [
        "2019",
        "Apollo Inc. publishes the GraphQL Federation specification and releases Apollo Gateway; the supergraph pattern is born"
      ],
      [
        "2023",
        "Federation 2 (Apollo) and WunderGraph Cosmo provide open federation runtimes; the OpenFederation specification effort begins"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Facebook为其新闻提要移动API在内部创建GraphQL；该规范于2015年开源"
      ],
      [
        "2018",
        "GraphQL采用率增长；团队在多个团队争夺单一模式仓库时遭遇单体模式扩展瓶颈"
      ],
      [
        "2019",
        "Apollo公司发布GraphQL联邦规范并发布Apollo Gateway；超图模式诞生"
      ],
      [
        "2023",
        "Federation 2（Apollo）和WunderGraph Cosmo提供开放联邦运行时；OpenFederation规范工作开始"
      ]
    ],
    "dos": [
      "Do start with a thin supergraph (2-3 subgraphs) and grow incrementally, because federation complexity compounds quickly with the number of subgraphs and cross-entity references",
      "Do run schema composition in CI before any subgraph deployment, so breaking changes are caught before they reach the router and affect clients",
      "Do version entity keys carefully — changing a @key field is a breaking change that requires coordinated migration across all subgraphs that reference that entity",
      "Do monitor query plan complexity; deeply nested cross-subgraph queries can generate N+1 fetch patterns that degrade performance under load"
    ],
    "dos_zh": [
      "从薄超图（2-3个子图）开始并逐步增长，因为联邦复杂性随子图数量和跨实体引用迅速增加",
      "在任何子图部署之前在CI中运行模式组合，以便在破坏性变更到达路由器并影响客户端之前捕获它们",
      "仔细对实体键进行版本控制——更改@key字段是一个破坏性变更，需要跨所有引用该实体的子图进行协调迁移",
      "监控查询计划复杂性；深度嵌套的跨子图查询可能产生N+1获取模式，在负载下降低性能"
    ],
    "donts": [
      "Don't create circular entity references between subgraphs — they create query plan deadlocks and make schema evolution extremely difficult",
      "Don't allow subgraphs to duplicate business logic for shared entities — the canonical resolver for each field should live in exactly one subgraph",
      "Don't expose the federation internals (subgraph URLs, entity keys) to external clients — the router is the only entry point and should abstract all composition details",
      "Don't skip performance testing of query plans across subgraph boundaries — a query innocent-looking in isolation can generate dozens of subgraph fetches at scale"
    ],
    "donts_zh": [
      "不要在子图之间创建循环实体引用——它们会造成查询计划死锁并使模式演进极其困难",
      "不要允许子图为共享实体重复业务逻辑——每个字段的权威解析器应该只存在于一个子图中",
      "不要向外部客户端暴露联邦内部（子图URL、实体键）——路由器是唯一的入口点，应该抽象所有组合细节",
      "不要跳过跨子图边界的查询计划性能测试——看起来无害的查询在规模化时可能产生数十个子图获取"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix adopted Apollo Federation to unify the GraphQL APIs of over 30 studio and streaming microservices behind a single supergraph consumed by their user-facing applications. Before federation, each product team maintained a separate GraphQL endpoint, forcing client teams to orchestrate multi-API calls and stitch responses manually. After migrating to federation, a single studio operations query that previously required 7 API calls was reduced to 1 federated query. Schema change coordination time dropped from 2-week release trains to continuous deployment with automated composition checks.",
    "case_study_zh": "Netflix采用Apollo联邦将超过30个工作室和流媒体微服务的GraphQL API统一在单一超图后面，供其面向用户的应用程序消费。在联邦之前，每个产品团队维护一个单独的GraphQL端点，迫使客户端团队手动协调多API调用和拼接响应。迁移到联邦后，之前需要7次API调用的单个工作室操作查询减少为1次联邦查询。模式变更协调时间从2周发布火车下降到具有自动组合检查的持续部署。",
    "when_not_to_use": [
      "Small teams with a single GraphQL service — federation introduces router infrastructure and composition tooling overhead that is not justified without multiple schema owners",
      "Teams new to GraphQL — learn core GraphQL patterns (schema design, resolvers, N+1 with DataLoader) before adding federation complexity",
      "Read-heavy APIs with simple, non-relational data where REST or tRPC is simpler to operate and reason about",
      "Organizations without a schema registry discipline — federation without enforced schema governance leads to accidental breaking changes that damage client trust"
    ],
    "when_not_to_use_zh": [
      "只有单一GraphQL服务的小团队——联邦引入了路由器基础设施和组合工具开销，在没有多个模式所有者的情况下不合理",
      "GraphQL新手团队——在增加联邦复杂性之前，先学习核心GraphQL模式（模式设计、解析器、使用DataLoader的N+1）",
      "具有简单非关系型数据的读密集型API，REST或tRPC更易于操作和推理",
      "没有模式注册表规范的组织——没有强制模式治理的联邦会导致意外的破坏性变更，损害客户端信任"
    ],
    "primary_source": "Apollo Graph, Inc. (2019). \"Apollo Federation Specification\". apollographql.com/docs/federation/",
    "primary_source_zh": "Apollo Graph公司（2019）。《Apollo联邦规范》。apollographql.com/docs/federation/",
    "secondary_sources": [
      "Bhatt, A. (2021). \"How Netflix Scaled their API with GraphQL Federation\". Netflix Technology Blog.",
      "Betts, J. (2020). \"GraphQL Federation: Putting the Graph in GraphQL\". InfoQ."
    ],
    "secondary_sources_zh": [
      "Bhatt, A.（2021）。《Netflix如何用GraphQL联邦扩展其API》。Netflix技术博客。",
      "Betts, J.（2020）。《GraphQL联邦：将图放入GraphQL》。InfoQ。"
    ]
  },
  {
    "id": 141,
    "name": "Spotify Model",
    "name_zh": "Spotify 模型",
    "slug": "spotify-model",
    "category": "team",
    "desc": "Organize engineering around autonomous squads grouped into tribes, with cross-cutting chapters and guilds for alignment",
    "desc_zh": "围绕自治小队组织工程团队，小队按业务线归入部落，通过跨部门的分会与公会实现对齐",
    "steps": [
      "Form small cross-functional squads (6-8 people) each owning a specific mission or product area",
      "Group related squads into tribes (≤100 people) sharing a common business domain",
      "Establish chapters as functional groupings (e.g., backend, QA) within a tribe for skill development and line management",
      "Create guilds as voluntary communities of interest that span the entire organization",
      "Empower squad autonomy for decisions on tools, processes, and technical approach while aligning on tribe-level goals"
    ],
    "steps_zh": [
      "组建小型跨职能小队（6-8 人），每个小队负责特定的使命或产品领域",
      "将相关小队归入部落（≤100 人），共享相同的业务领域",
      "在部落内建立分会，按职能分组（如后端、QA），用于技能发展和直线管理",
      "创建公会，作为跨组织的自愿兴趣社区",
      "赋予小队在工具、流程和技术方案上的自主决策权，同时在部落层面保持目标对齐"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Squad",
      "Tribe",
      "Chapter",
      "Guild",
      "Autonomy"
    ],
    "viz_labels_zh": [
      "小队",
      "部落",
      "分会",
      "协会",
      "自治"
    ],
    "related": [
      "team-topologies",
      "conways-law",
      "amazon-two-pizza-teams",
      "inverse-conway-maneuver"
    ],
    "tags": [
      "team-structure",
      "autonomy",
      "scaling",
      "agile",
      "spotify"
    ],
    "origin_author": "Henrik Kniberg & Anders Ivarsson, 2012",
    "origin_source": "Scaling Agile @ Spotify (Kniberg & Ivarsson, 2012 whitepaper); referenced in Team Topologies (Skelton & Pais, 2019)",
    "origin_source_zh": "《Spotify 的规模化敏捷》（Kniberg & Ivarsson，2012 白皮书）；在《团队拓扑》（Skelton & Pais，2019）中有引用",
    "complexity": "advanced",
    "when_to_use": [
      "Scaling an engineering organization beyond 50 engineers while preserving startup-like autonomy",
      "Reducing cross-team dependencies that slow down delivery in a growing product company",
      "Fostering a strong engineering culture where teams choose their own tools and practices",
      "Organizations transitioning from rigid hierarchical structures to mission-driven teams"
    ],
    "when_to_use_zh": [
      "将工程组织扩展至 50 人以上规模的同时保持创业公司般的自主性",
      "减少成长中产品公司因跨团队依赖导致的交付速度下降",
      "培育团队自主选择工具和实践的强工程文化",
      "组织从刚性层级结构向使命驱动团队转型"
    ],
    "core_concepts": [
      "Squad: The basic unit of delivery — a small, autonomous, cross-functional team with end-to-end ownership of a product area or mission",
      "Tribe: A collection of squads (up to ~100 people) working in a related business area, led by a tribe lead who fosters collaboration",
      "Chapter: A functional grouping within a tribe (e.g., all iOS developers) for skill sharing, mentoring, and line management",
      "Guild: A lightweight, voluntary community of practice spanning the entire organization, enabling cross-pollination of knowledge",
      "Alignment vs. Autonomy: The model explicitly balances organizational alignment (what to build) with squad autonomy (how to build it)"
    ],
    "core_concepts_zh": [
      "小队：交付的基本单元——一个小型、自治、跨职能的团队，端到端负责一个产品领域或使命",
      "部落：在相关业务领域工作的小队集合（最多约 100 人），由部落负责人引导协作",
      "分会：部落内的职能分组（如所有 iOS 开发者），用于技能分享、导师制和直线管理",
      "公会：跨组织的轻量级自愿实践社区，促进知识交叉传播",
      "对齐与自主的平衡：该模型显式地平衡组织对齐（构建什么）与小队自主（如何构建）"
    ],
    "timeline": [
      [
        "2012",
        "Henrik Kniberg and Anders Ivarsson publish the 'Scaling Agile @ Spotify' whitepaper describing squads, tribes, chapters, and guilds"
      ],
      [
        "2014",
        "Spotify releases a companion video series that popularizes the model across the industry"
      ],
      [
        "2016",
        "ING Bank publicly adopts a Spotify-inspired structure during its agile transformation, sparking enterprise adoption"
      ],
      [
        "2019",
        "Skelton and Pais critique cargo-culting of the model in Team Topologies, emphasizing that Spotify's own structure evolved beyond the original paper"
      ],
      [
        "2020",
        "Jeremiah Lee publishes 'Spotify's Failed Squad Goals', prompting reassessment of blind adoption"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Henrik Kniberg 和 Anders Ivarsson 发表《Spotify 的规模化敏捷》白皮书，描述小队、部落、分会和公会结构"
      ],
      [
        "2014",
        "Spotify 发布配套视频系列，使该模型在业界广泛传播"
      ],
      [
        "2016",
        "ING 银行在敏捷转型中公开采用受 Spotify 启发的结构，引发企业级采纳浪潮"
      ],
      [
        "2019",
        "Skelton 和 Pais 在《团队拓扑》中批评对该模型的「货物崇拜」式照搬，强调 Spotify 自身结构已超越原始论文"
      ],
      [
        "2020",
        "Jeremiah Lee 发表《Spotify 失败的小队目标》，促使业界重新审视盲目采纳"
      ]
    ],
    "dos": [
      "Adapt the model to your organization's culture rather than copying it verbatim from the whitepaper",
      "Invest in strong chapter leads who balance people management with technical excellence across squads",
      "Define clear squad missions and measurable outcomes to prevent squads from becoming siloed feature factories",
      "Use guilds actively to share practices, not just as inactive Slack channels"
    ],
    "dos_zh": [
      "根据自身组织文化调整该模型，而非逐字照搬白皮书",
      "投资培养强有力的分会负责人，在跨小队范围内平衡人员管理与技术卓越",
      "为小队定义清晰的使命和可衡量的成果，防止小队沦为孤立的功能工厂",
      "积极利用公会分享实践，而不仅仅作为不活跃的 Slack 频道"
    ],
    "donts": [
      "Don't cargo-cult the Spotify model — even Spotify itself admitted the paper described an aspiration, not reality",
      "Don't create squads without clear missions, because autonomy without purpose leads to fragmentation",
      "Don't neglect cross-squad coordination — tribal alignment mechanisms are as important as squad autonomy",
      "Don't assume the model works at all scales — organizations under 30 engineers rarely need this complexity"
    ],
    "donts_zh": [
      "不要「货物崇拜」式地照搬 Spotify 模型——连 Spotify 自身都承认论文描述的是愿景而非现实",
      "不要在没有清晰使命的情况下创建小队，因为没有目标的自主权会导致碎片化",
      "不要忽视跨小队协调——部落层面的对齐机制与小队自主权同样重要",
      "不要假设该模型在所有规模都适用——30 人以下的工程组织很少需要这种复杂度"
    ],
    "case_study_company": "ING Bank",
    "case_study": "ING Bank Netherlands underwent a massive agile transformation in 2015 inspired by the Spotify model. They reorganized 3,500 employees into roughly 350 squads grouped into 13 tribes, eliminating traditional department boundaries. Within two years, ING reported a 30% improvement in time-to-market for new features, though they publicly acknowledged needing to adapt the model significantly — adding product owners per tribe and stronger cross-tribe dependency management that the original Spotify whitepaper did not address.",
    "case_study_zh": "ING 银行荷兰分部于 2015 年受 Spotify 模型启发进行了大规模敏捷转型。他们将 3,500 名员工重组为约 350 个小队，归入 13 个部落，打破了传统部门边界。两年内，ING 报告新功能上市时间缩短了 30%，但他们公开承认需要对模型进行重大调整——增加了每个部落的产品负责人以及更强的跨部落依赖管理机制，这些都是原始 Spotify 白皮书未涉及的内容。",
    "when_not_to_use": [
      "Small organizations (under 30 engineers) where the overhead of tribes and chapters exceeds the coordination benefit",
      "Highly regulated industries requiring strict hierarchical approval chains that conflict with squad autonomy",
      "Organizations without mature engineering culture — the model assumes high trust and psychological safety",
      "Teams with deeply coupled codebases where true squad autonomy over independent deployments is impossible"
    ],
    "when_not_to_use_zh": [
      "小型组织（30 名工程师以下），部落和分会的管理开销超过协调收益",
      "高度监管行业要求严格的层级审批链，与小队自主权相冲突",
      "工程文化尚不成熟的组织——该模型假设存在高度信任和心理安全感",
      "代码库深度耦合的团队，无法实现小队对独立部署的真正自主"
    ],
    "adopters": [
      "Spotify",
      "ING Bank",
      "Zalando",
      "Gilt",
      "Elsevier"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Kniberg, H. & Ivarsson, A. (2012). \"Scaling Agile @ Spotify with Tribes, Squads, Chapters & Guilds\". Spotify Labs Whitepaper.",
    "secondary_sources": [
      "Skelton, M. & Pais, M. (2019). \"Team Topologies: Organizing Business and Technology Teams for Fast Flow\". IT Revolution Press.",
      "Lee, J. (2020). \"Spotify's Failed Squad Goals\". jeremiahlee.com."
    ],
    "typed_relations": [
      {
        "slug": "team-topologies",
        "type": "alternative"
      },
      {
        "slug": "conways-law",
        "type": "prerequisite"
      },
      {
        "slug": "amazon-two-pizza-teams",
        "type": "alternative"
      },
      {
        "slug": "inverse-conway-maneuver",
        "type": "related"
      }
    ]
  },
  {
    "id": 142,
    "name": "Amazon Two-Pizza Teams",
    "name_zh": "亚马逊两个披萨团队",
    "slug": "amazon-two-pizza-teams",
    "category": "team",
    "desc": "Limit team size to what two pizzas can feed (~6-10 people) to maximize ownership and minimize communication overhead",
    "desc_zh": "将团队规模限制在两个披萨能喂饱的人数（约 6-10 人），以最大化责任归属感并最小化沟通开销",
    "steps": [
      "Decompose the product or platform into independently deliverable service boundaries",
      "Assign each service boundary to a small team of 6-10 people with full-stack capability",
      "Grant each team end-to-end ownership: build, deploy, operate, and support their service",
      "Define a clear 'fitness function' or set of metrics each team is accountable for",
      "Minimize inter-team dependencies by designing service APIs as team contracts"
    ],
    "steps_zh": [
      "将产品或平台分解为可独立交付的服务边界",
      "为每个服务边界分配一个 6-10 人的小型全栈团队",
      "赋予每个团队端到端的所有权：构建、部署、运维和支持其服务",
      "为每个团队定义明确的「适应度函数」或其负责的指标集",
      "通过将服务 API 设计为团队契约来最小化团队间依赖"
    ],
    "ai_relevant": false,
    "viz_type": "matrix",
    "viz_labels": [
      "Service Boundary",
      "Small Team",
      "End-to-End Ownership",
      "Fitness Function",
      "API Contract"
    ],
    "viz_labels_zh": [
      "服务边界",
      "小团队",
      "端到端所有权",
      "适应性函数",
      "API契约"
    ],
    "related": [
      "conways-law",
      "team-topologies",
      "spotify-model",
      "inverse-conway-maneuver"
    ],
    "tags": [
      "team-size",
      "autonomy",
      "ownership",
      "microservices",
      "amazon"
    ],
    "origin_author": "Jeff Bezos, ~2002",
    "origin_source": "Internal Amazon mandate; described in The Everything Store (Brad Stone, 2013) and referenced in Accelerate (Forsgren, Humble & Kim, 2018)",
    "origin_source_zh": "亚马逊内部指令；在《一网打尽》（Brad Stone，2013）中有描述，在《加速》（Forsgren、Humble & Kim，2018）中有引用",
    "complexity": "intermediate",
    "when_to_use": [
      "Organizations experiencing coordination bottlenecks as teams grow beyond 10 people",
      "Building a microservices architecture where each service needs a clear owning team",
      "Scaling a fast-growing startup while preserving speed and accountability",
      "Environments where 'you build it, you run it' operational ownership is desired"
    ],
    "when_to_use_zh": [
      "组织在团队超过 10 人后出现协调瓶颈",
      "构建微服务架构，每个服务需要明确的负责团队",
      "快速成长的创业公司在扩展规模的同时保持速度和责任",
      "期望实现「谁构建，谁运维」运营归属制的环境"
    ],
    "core_concepts": [
      "Small team size: Brooks's Law shows communication overhead grows quadratically with team size — two-pizza teams stay under the threshold where coordination costs dominate",
      "Full ownership: Each team owns the entire lifecycle of their service, from design through production operations and on-call",
      "Service-oriented architecture: The organizational pattern is inseparable from the technical architecture — small teams map to small, decoupled services",
      "Single-threaded leadership: Each two-pizza team has one leader whose sole focus is that team's mission, avoiding divided attention",
      "API-as-contract: Teams communicate through well-defined service interfaces, reducing the need for face-to-face coordination"
    ],
    "core_concepts_zh": [
      "小团队规模：布鲁克斯定律表明沟通开销随团队规模二次增长——两个披萨团队保持在协调成本主导的阈值之下",
      "完全所有权：每个团队拥有其服务的完整生命周期，从设计到生产运维和值班",
      "面向服务架构：组织模式与技术架构不可分割——小团队映射为小型解耦服务",
      "单线程领导力：每个两个披萨团队有一位领导者，其唯一关注点是该团队的使命，避免注意力分散",
      "API 即契约：团队通过定义明确的服务接口通信，减少面对面协调的需要"
    ],
    "timeline": [
      [
        "2002",
        "Jeff Bezos issues the famous API mandate and two-pizza team rule at Amazon, forcing all teams to communicate only through service interfaces"
      ],
      [
        "2006",
        "Amazon Web Services launches, itself built by two-pizza teams, validating the organizational model at scale"
      ],
      [
        "2013",
        "Brad Stone documents the two-pizza rule in The Everything Store, bringing it mainstream"
      ],
      [
        "2018",
        "Forsgren, Humble and Kim cite small, autonomous teams as a key predictor of high software delivery performance in Accelerate"
      ],
      [
        "2020",
        "The pattern becomes standard practice in cloud-native organizations, reinforced by the microservices movement"
      ]
    ],
    "timeline_zh": [
      [
        "2002",
        "Jeff Bezos 在亚马逊发布著名的 API 指令和两个披萨团队规则，强制所有团队只通过服务接口通信"
      ],
      [
        "2006",
        "AWS 发布，其自身就由两个披萨团队构建，在规模上验证了该组织模型"
      ],
      [
        "2013",
        "Brad Stone 在《一网打尽》中记录了两个披萨规则，使其进入主流视野"
      ],
      [
        "2018",
        "Forsgren、Humble 和 Kim 在《加速》中将小型自治团队列为高软件交付绩效的关键预测因子"
      ],
      [
        "2020",
        "该模式在云原生组织中成为标准实践，受微服务运动强化"
      ]
    ],
    "dos": [
      "Ensure each team has the full-stack capability to deliver independently without waiting on other teams",
      "Define clear service boundaries and APIs before forming teams, so team structure reflects architecture (Inverse Conway)",
      "Give teams real ownership by making them responsible for operational metrics, not just shipping features",
      "Keep the team's mission focused — a two-pizza team with a sprawling mandate is worse than a large team"
    ],
    "dos_zh": [
      "确保每个团队具备完整的全栈能力，能够独立交付而无需等待其他团队",
      "在组建团队之前定义清晰的服务边界和 API，使团队结构反映架构（反向康威策略）",
      "通过让团队对运营指标负责而非仅负责功能交付，赋予团队真正的所有权",
      "保持团队使命聚焦——职责蔓延的两个披萨团队还不如一个大团队"
    ],
    "donts": [
      "Don't split teams artificially just to hit the size target — the service boundary must be meaningful",
      "Don't create two-pizza teams without supporting platform infrastructure, or each team will reinvent the wheel",
      "Don't ignore Brooks's corollary: splitting work across too many small teams creates its own integration overhead",
      "Don't forget that two-pizza teams still need cross-team alignment mechanisms like architecture reviews or RFCs"
    ],
    "donts_zh": [
      "不要为了达到人数目标而人为拆分团队——服务边界必须有实际意义",
      "不要在没有配套平台基础设施的情况下创建两个披萨团队，否则每个团队都会重复造轮子",
      "不要忽视布鲁克斯定律的推论：将工作拆分到过多小团队会产生自身的集成开销",
      "不要忘记两个披萨团队仍需要跨团队对齐机制，如架构评审或 RFC"
    ],
    "case_study_company": "Amazon",
    "case_study": "Amazon's own transformation from a monolithic bookstore application to a services-based architecture in the early 2000s was driven by the two-pizza team mandate. By 2006, Amazon had decomposed into hundreds of small teams, each owning a distinct service. This organizational structure directly enabled the creation of AWS — teams that built internal infrastructure services realized they could offer them externally. The model is credited by former VP Charlie Bell as foundational to Amazon's ability to scale from $4B to $500B+ in revenue while maintaining engineering velocity.",
    "case_study_zh": "亚马逊自身从单体书店应用向基于服务的架构转型（2000 年代初）就是由两个披萨团队指令驱动的。到 2006 年，亚马逊已分解为数百个小团队，每个团队拥有一个独立服务。这种组织结构直接催生了 AWS——构建内部基础设施服务的团队意识到可以将其对外提供。前副总裁 Charlie Bell 将此模型视为亚马逊从 40 亿美元增长至 5000 亿美元以上收入同时保持工程速度的基石。",
    "when_not_to_use": [
      "Tightly coupled monolithic systems where meaningful service decomposition is not yet feasible",
      "Organizations too small to have more than 2-3 teams — the overhead of inter-team contracts exceeds the benefit",
      "Research-heavy environments where exploratory work requires fluid, cross-cutting collaboration",
      "Contexts where shared domain expertise is critical and splitting it across teams would dilute knowledge"
    ],
    "when_not_to_use_zh": [
      "紧耦合单体系统，有意义的服务分解尚不可行",
      "组织规模太小，不超过 2-3 个团队——团队间契约的开销超过收益",
      "以研究为主的环境，探索性工作需要流动的跨领域协作",
      "共享领域专业知识至关重要的场景，拆分到多个团队会稀释知识"
    ],
    "adopters": [
      "Amazon",
      "Netflix",
      "Google",
      "Uber",
      "Twilio"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "scalability"
    ],
    "maturity_ring": "established",
    "primary_source": "Stone, B. (2013). \"The Everything Store: Jeff Bezos and the Age of Amazon\". Little, Brown and Company.",
    "secondary_sources": [
      "Bryar, C. & Carr, B. (2021). \"Working Backwards: Insights, Stories, and Secrets from Inside Amazon\". St. Martin's Press.",
      "Forsgren, N., Humble, J. & Kim, G. (2018). \"Accelerate: The Science of Lean Software and DevOps\". IT Revolution Press."
    ],
    "typed_relations": [
      {
        "slug": "conways-law",
        "type": "prerequisite"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      },
      {
        "slug": "spotify-model",
        "type": "alternative"
      },
      {
        "slug": "inverse-conway-maneuver",
        "type": "related"
      }
    ]
  },
  {
    "id": 143,
    "name": "Inner Source",
    "name_zh": "内部开源",
    "slug": "inner-source",
    "category": "team",
    "desc": "Apply open-source development practices within an organization to break down silos and improve code reuse",
    "desc_zh": "在组织内部应用开源开发实践，打破部门壁垒，提升代码复用",
    "steps": [
      "Identify candidate projects with broad internal consumers and establish them as inner-source repositories",
      "Define contribution guidelines, code review standards, and a CONTRIBUTING.md for each inner-source project",
      "Designate trusted committers from the owning team who review and merge external contributions",
      "Encourage teams to submit pull requests to shared codebases instead of forking or duplicating functionality",
      "Measure adoption through contribution metrics: cross-team PRs, contributor diversity, and reuse rates"
    ],
    "steps_zh": [
      "识别拥有广泛内部消费者的候选项目，将其建立为内部开源仓库",
      "为每个内部开源项目定义贡献指南、代码评审标准和 CONTRIBUTING.md",
      "从拥有团队中指定受信任的提交者，负责评审和合并外部贡献",
      "鼓励团队向共享代码库提交拉取请求，而非分叉或重复实现功能",
      "通过贡献指标衡量采纳情况：跨团队 PR 数量、贡献者多样性和复用率"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Inner Source Repo",
      "Contribution Guide",
      "Trusted Committer",
      "Cross-Team PR",
      "Metrics"
    ],
    "viz_labels_zh": [
      "内部开源仓库",
      "贡献指南",
      "受信提交者",
      "跨团队PR",
      "度量指标"
    ],
    "related": [
      "platform-engineering",
      "conways-law",
      "team-topologies",
      "amazon-two-pizza-teams"
    ],
    "tags": [
      "collaboration",
      "open-source",
      "code-reuse",
      "inner-source",
      "culture"
    ],
    "origin_author": "Tim O'Reilly, 2000; formalized by PayPal (Danese Cooper, ~2014)",
    "origin_source": "InnerSource Commons Foundation; referenced in Accelerate (Forsgren, Humble & Kim, 2018) and Team Topologies (Skelton & Pais, 2019)",
    "origin_source_zh": "InnerSource Commons 基金会；在《加速》（Forsgren、Humble & Kim，2018）和《团队拓扑》（Skelton & Pais，2019）中有引用",
    "complexity": "intermediate",
    "when_to_use": [
      "Multiple teams duplicating similar functionality because they cannot contribute to each other's codebases",
      "Shared libraries or platform components that evolve too slowly because only one team owns them",
      "Organizations wanting to improve engineering culture by promoting transparency and collaboration",
      "Large enterprises seeking to reduce bottlenecks on central platform teams via distributed contribution"
    ],
    "when_to_use_zh": [
      "多个团队因无法向彼此的代码库贡献而重复实现相似功能",
      "共享库或平台组件因仅一个团队拥有而演进过慢",
      "希望通过推广透明和协作来改善工程文化的组织",
      "大型企业寻求通过分布式贡献减少中央平台团队的瓶颈"
    ],
    "core_concepts": [
      "Trusted Committer: A designated reviewer from the owning team who ensures external contributions meet quality standards and project direction",
      "Contribution guidelines: Explicit documentation (CONTRIBUTING.md, coding standards, PR templates) that lowers the barrier for cross-team contributions",
      "Transparent development: All code, issues, roadmaps, and design decisions are visible to the entire organization, mimicking open-source transparency",
      "Guest contributors: Engineers from consuming teams who submit patches to fix bugs or add features they need, rather than waiting on the owning team",
      "Community over hierarchy: Decisions are made through code review and technical merit rather than organizational reporting lines"
    ],
    "core_concepts_zh": [
      "受信任的提交者：拥有团队中的指定评审者，确保外部贡献满足质量标准和项目方向",
      "贡献指南：明确的文档（CONTRIBUTING.md、编码标准、PR 模板），降低跨团队贡献的门槛",
      "透明开发：所有代码、问题、路线图和设计决策对整个组织可见，模拟开源的透明性",
      "客座贡献者：来自消费团队的工程师提交补丁以修复 Bug 或添加其所需功能，而非等待拥有团队",
      "社区优先于层级：依据代码评审和技术优劣做出决策，而非依赖组织汇报关系"
    ],
    "timeline": [
      [
        "2000",
        "Tim O'Reilly coins the term 'inner source' to describe applying open-source practices inside corporations"
      ],
      [
        "2014",
        "Danese Cooper formalizes inner-source practices at PayPal and begins advocating across the industry"
      ],
      [
        "2015",
        "InnerSource Commons community launches, providing patterns, maturity models, and case studies"
      ],
      [
        "2018",
        "Microsoft reports thousands of inner-source repositories across the company, crediting DevOps acceleration"
      ],
      [
        "2023",
        "AI-assisted code review tools (GitHub Copilot for PRs) lower the friction of cross-team contributions further"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "Tim O'Reilly 创造「内部开源」一词，描述在企业内部应用开源实践"
      ],
      [
        "2014",
        "Danese Cooper 在 PayPal 正式化内部开源实践，并开始在业界推广"
      ],
      [
        "2015",
        "InnerSource Commons 社区成立，提供模式、成熟度模型和案例研究"
      ],
      [
        "2018",
        "Microsoft 报告公司内部有数千个内部开源仓库，并归功于 DevOps 加速"
      ],
      [
        "2023",
        "AI 辅助代码评审工具（如 GitHub Copilot for PRs）进一步降低跨团队贡献的摩擦"
      ]
    ],
    "dos": [
      "Invest in excellent documentation and onboarding guides for each inner-source project to reduce contributor friction",
      "Recognize and reward cross-team contributions in performance reviews to incentivize participation",
      "Start with 2-3 high-visibility pilot projects before attempting organization-wide inner-source adoption",
      "Ensure trusted committers have dedicated time for reviewing external contributions, not just their own team's work"
    ],
    "dos_zh": [
      "为每个内部开源项目投资优秀的文档和上手指南，减少贡献者摩擦",
      "在绩效评审中认可并奖励跨团队贡献，激励参与",
      "在尝试全组织推广前，先从 2-3 个高可见度的试点项目开始",
      "确保受信任的提交者有专门时间评审外部贡献，而非仅处理自身团队的工作"
    ],
    "donts": [
      "Don't mandate inner-source without cultural readiness — forced transparency in a low-trust environment backfires",
      "Don't neglect the trusted committer role — without timely reviews, contributors lose motivation and revert to forking",
      "Don't assume inner-source eliminates the need for owning teams — someone must still set direction and maintain quality",
      "Don't ignore the overhead: reviewing external PRs takes real time, and owning teams need staffing for this"
    ],
    "donts_zh": [
      "不要在文化尚未准备好的情况下强制推行内部开源——在低信任环境中强制透明会适得其反",
      "不要忽视受信任的提交者角色——没有及时的评审，贡献者会失去动力并转向分叉",
      "不要以为内部开源可以消除拥有团队的需要——仍需有人制定方向和维持质量",
      "不要忽略开销：评审外部 PR 需要实际时间，拥有团队需要为此配备人员"
    ],
    "case_study_company": "Microsoft",
    "case_study": "Microsoft's inner-source adoption accelerated after Satya Nadella's cultural transformation beginning in 2014. By 2018, over 40,000 engineers contributed to inner-source repositories across Azure, Office, and Windows divisions. The 1ES (One Engineering System) initiative standardized tooling that made cross-team contributions frictionless. Teams reported 40% faster resolution of cross-cutting bugs because consuming teams could fix issues themselves rather than filing tickets and waiting. The shift was credited as a key enabler of Microsoft's transition from the infamous 'stack ranking' culture to a collaborative growth mindset.",
    "case_study_zh": "Microsoft 的内部开源采纳在 Satya Nadella 于 2014 年启动的文化转型后加速推进。到 2018 年，超过 40,000 名工程师在 Azure、Office 和 Windows 部门的内部开源仓库中做出贡献。1ES（统一工程系统）计划标准化了工具链，使跨团队贡献无摩擦。团队报告跨领域 Bug 的解决速度提高了 40%，因为消费团队可以自行修复问题而非提交工单等待。这一转变被认为是 Microsoft 从臭名昭著的「末位排名」文化向协作成长思维转型的关键推动力。",
    "when_not_to_use": [
      "Organizations with strong intellectual property barriers between divisions that prevent code sharing",
      "Very early-stage startups where everyone already works in the same codebase and formal contribution processes add unnecessary overhead",
      "Teams with security-critical code that requires restricted access and cannot be opened to broad internal audiences",
      "Environments lacking basic DevOps maturity (CI/CD, code review tooling) needed to support distributed contribution"
    ],
    "when_not_to_use_zh": [
      "各部门间存在严格知识产权壁垒、阻碍代码共享的组织",
      "非常早期的创业公司，所有人已在同一代码库中工作，正式贡献流程增加不必要的开销",
      "拥有安全关键代码的团队，需要限制访问权限，无法对广泛的内部受众开放",
      "缺乏支持分布式贡献所需的基础 DevOps 成熟度（CI/CD、代码评审工具）的环境"
    ],
    "adopters": [
      "Microsoft",
      "PayPal",
      "Bloomberg",
      "Bosch",
      "SAP"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Stol, K.-J. et al. (2014). \"Key Factors for Adopting Inner Source\". ACM Transactions on Software Engineering and Methodology, 23(2).",
    "secondary_sources": [
      "Cooper, D. & Stol, K.-J. (2018). \"Adopting InnerSource: Principles and Case Studies\". O'Reilly Media.",
      "InnerSource Commons Foundation (2020). \"InnerSource Patterns\". innersourcecommons.org."
    ],
    "typed_relations": [
      {
        "slug": "platform-engineering",
        "type": "complement"
      },
      {
        "slug": "conways-law",
        "type": "related"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      },
      {
        "slug": "amazon-two-pizza-teams",
        "type": "complement"
      }
    ]
  },
  {
    "id": 144,
    "name": "Platform Engineering",
    "name_zh": "平台工程",
    "slug": "platform-engineering",
    "category": "team",
    "desc": "Build and operate an internal developer platform as a product, enabling stream-aligned teams to self-serve infrastructure and tooling",
    "desc_zh": "将内部开发者平台作为产品来构建和运营，使面向价值流的团队能够自助使用基础设施和工具",
    "steps": [
      "Identify the most common developer pain points: slow environment setup, deployment friction, observability gaps",
      "Form a dedicated platform team that treats internal developers as their customers",
      "Build a thin, self-service platform layer (golden paths) that abstracts infrastructure complexity",
      "Measure platform success through adoption rates, developer satisfaction, and time-to-first-deploy for new services",
      "Iterate on platform capabilities based on continuous user research with stream-aligned teams"
    ],
    "steps_zh": [
      "识别最常见的开发者痛点：环境搭建缓慢、部署摩擦、可观测性缺口",
      "组建专门的平台团队，将内部开发者视为客户",
      "构建轻量、自助的平台层（黄金路径），抽象基础设施复杂性",
      "通过采纳率、开发者满意度和新服务首次部署时间衡量平台成效",
      "基于对面向价值流团队的持续用户调研，迭代平台能力"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Pain Points",
      "Platform Team",
      "Golden Paths",
      "Adoption Metrics",
      "Iteration"
    ],
    "viz_labels_zh": [
      "痛点",
      "平台团队",
      "黄金路径",
      "采用度",
      "持续迭代"
    ],
    "related": [
      "team-topologies",
      "inner-source",
      "developer-experience-framework",
      "amazon-two-pizza-teams"
    ],
    "tags": [
      "platform",
      "developer-experience",
      "self-service",
      "internal-tools",
      "golden-path"
    ],
    "origin_author": "Evan Bottcher (ThoughtWorks), ~2018; formalized in Team Topologies (Skelton & Pais, 2019)",
    "origin_source": "Team Topologies (Skelton & Pais, 2019); CNCF Platform Engineering Maturity Model (2023)",
    "origin_source_zh": "《团队拓扑》（Skelton & Pais，2019）；CNCF 平台工程成熟度模型（2023）",
    "complexity": "advanced",
    "when_to_use": [
      "Organizations where each team spends significant time on undifferentiated infrastructure work",
      "Scaling beyond 10 stream-aligned teams that all need consistent CI/CD, monitoring, and deployment",
      "Reducing cognitive load on product teams by abstracting away Kubernetes, cloud, and networking complexity",
      "Accelerating onboarding so new engineers can deploy to production within their first week"
    ],
    "when_to_use_zh": [
      "组织中每个团队在无差别的基础设施工作上花费大量时间",
      "扩展到 10 个以上面向价值流的团队，都需要一致的 CI/CD、监控和部署",
      "通过抽象 Kubernetes、云和网络复杂性来减轻产品团队的认知负荷",
      "加速新人入职，使新工程师在第一周内就能部署到生产环境"
    ],
    "core_concepts": [
      "Platform as a Product: The internal platform is built with product management rigor — user research, roadmaps, SLOs, and feedback loops — not just as an infrastructure project",
      "Golden Paths: Opinionated, well-supported default workflows that cover 80% of use cases, while still allowing teams to go off-path when needed",
      "Self-Service: Developers provision environments, deploy services, and access logs through APIs and portals without filing tickets or waiting for ops teams",
      "Thin Platform: The platform abstracts complexity but does not hide it entirely — developers can still inspect and understand the underlying infrastructure when debugging",
      "Cognitive Load Reduction: The primary metric of platform success is how much mental overhead it removes from stream-aligned teams, as defined in Team Topologies"
    ],
    "core_concepts_zh": [
      "平台即产品：内部平台以产品管理的严谨度来构建——用户调研、路线图、SLO 和反馈循环——而非仅作为基础设施项目",
      "黄金路径：有主见的、充分支持的默认工作流，覆盖 80% 的用例，同时仍允许团队在需要时脱离路径",
      "自助服务：开发者通过 API 和门户自行配置环境、部署服务和访问日志，无需提工单或等待运维团队",
      "轻量平台：平台抽象复杂性但不完全隐藏——开发者在调试时仍可检查和理解底层基础设施",
      "认知负荷降低：平台成功的首要衡量标准是它为面向价值流团队减少了多少心智负担，正如《团队拓扑》中所定义的"
    ],
    "timeline": [
      [
        "2018",
        "Evan Bottcher's 'What I Talk About When I Talk About Platforms' article defines the internal platform as a product concept"
      ],
      [
        "2019",
        "Skelton and Pais formalize the platform team as one of four fundamental team types in Team Topologies"
      ],
      [
        "2022",
        "Gartner names platform engineering a top strategic technology trend, predicting that 80% of large software engineering organizations will establish platform engineering teams by 2026"
      ],
      [
        "2023",
        "CNCF publishes the Platform Engineering Maturity Model; PlatformCon attracts 20,000+ attendees"
      ],
      [
        "2024",
        "Backstage (Spotify), Port, and Humanitec emerge as leading internal developer portal frameworks"
      ]
    ],
    "timeline_zh": [
      [
        "2018",
        "Evan Bottcher 的文章《当我谈论平台时我在谈论什么》定义了内部平台即产品的概念"
      ],
      [
        "2019",
        "Skelton 和 Pais 在《团队拓扑》中将平台团队正式确立为四种基本团队类型之一"
      ],
      [
        "2022",
        "Gartner 将平台工程列为顶级战略技术趋势，预测到 2026 年 80% 的大型软件工程组织将组建平台工程团队"
      ],
      [
        "2023",
        "CNCF 发布平台工程成熟度模型；PlatformCon 吸引超过 20,000 名参与者"
      ],
      [
        "2024",
        "Backstage（Spotify）、Port 和 Humanitec 成为领先的内部开发者门户框架"
      ]
    ],
    "dos": [
      "Treat internal developers as real customers: conduct user interviews, track NPS, and maintain a public roadmap",
      "Start with the highest-friction developer workflow and build the first golden path there",
      "Provide an escape hatch — let teams customize or bypass the platform when their use case genuinely requires it",
      "Staff the platform team with experienced engineers who understand both infrastructure and developer workflows"
    ],
    "dos_zh": [
      "将内部开发者视为真正的客户：进行用户访谈、跟踪 NPS、维护公开路线图",
      "从摩擦最大的开发者工作流开始，在那里构建第一条黄金路径",
      "提供逃生通道——当团队的用例确实需要时，允许其自定义或绕过平台",
      "为平台团队配备既理解基础设施又理解开发者工作流的资深工程师"
    ],
    "donts": [
      "Don't build a 'ticket ops' platform where developers still need to file requests and wait — that is not self-service",
      "Don't mandate platform adoption through policy instead of value — teams should choose the platform because it is genuinely better",
      "Don't over-abstract: a platform that hides too much makes debugging impossible and frustrates experienced engineers",
      "Don't treat the platform as a one-time project — it requires continuous product investment like any customer-facing product"
    ],
    "donts_zh": [
      "不要构建「工单运维」式平台，开发者仍需提交请求并等待——那不是自助服务",
      "不要通过政策而非价值来强制采纳平台——团队应该因为平台真正更好而选择它",
      "不要过度抽象：隐藏太多内容的平台让调试变得不可能，令资深工程师感到沮丧",
      "不要将平台视为一次性项目——它需要像任何面向客户的产品一样持续投资"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify built Backstage, an open-source internal developer portal, starting in 2016 to solve the problem of fragmented tooling across 300+ engineering teams. Backstage provides a unified service catalog, software templates, and plugin architecture that standardizes how teams create, discover, and manage services. By 2020, Spotify reported that new engineers could deploy to production in their first week (previously it took over a month). They open-sourced Backstage in 2020, and it became a CNCF Incubating project in 2022, adopted by hundreds of organizations worldwide.",
    "case_study_zh": "Spotify 从 2016 年开始构建 Backstage（开源内部开发者门户），以解决 300 多个工程团队工具碎片化的问题。Backstage 提供统一的服务目录、软件模板和插件架构，标准化团队创建、发现和管理服务的方式。到 2020 年，Spotify 报告新工程师可以在第一周内部署到生产环境（此前需要一个多月）。他们于 2020 年将 Backstage 开源，2022 年成为 CNCF 孵化项目，被全球数百个组织采纳。",
    "when_not_to_use": [
      "Organizations with fewer than 5 engineering teams — the overhead of a dedicated platform team is not justified",
      "Early-stage startups where infrastructure needs are simple and a PaaS like Heroku or Vercel suffices",
      "Environments where teams have radically different technology stacks that resist standardization",
      "Organizations that lack executive support for sustained investment in internal tooling"
    ],
    "when_not_to_use_zh": [
      "工程团队少于 5 个的组织——专门平台团队的开销不合理",
      "早期创业公司，基础设施需求简单，Heroku 或 Vercel 等 PaaS 即可满足",
      "各团队技术栈差异极大且抗拒标准化的环境",
      "缺乏高管对内部工具持续投资支持的组织"
    ],
    "adopters": [
      "Spotify",
      "Netflix",
      "Zalando",
      "Mercado Libre",
      "Deutsche Telekom"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "usability",
      "scalability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Skelton, M. & Pais, M. (2019). \"Team Topologies: Organizing Business and Technology Teams for Fast Flow\". IT Revolution Press.",
    "secondary_sources": [
      "Bottcher, E. (2018). \"What I Talk About When I Talk About Platforms\". martinfowler.com.",
      "CNCF (2023). \"Platform Engineering Maturity Model\". tag-app-delivery.cncf.io."
    ],
    "typed_relations": [
      {
        "slug": "team-topologies",
        "type": "complement"
      },
      {
        "slug": "inner-source",
        "type": "complement"
      },
      {
        "slug": "developer-experience-framework",
        "type": "complement"
      },
      {
        "slug": "amazon-two-pizza-teams",
        "type": "complement"
      }
    ]
  },
  {
    "id": 145,
    "name": "Engineering Ladder / Career Framework",
    "name_zh": "工程师阶梯 / 职业发展框架",
    "slug": "engineering-ladder",
    "category": "team",
    "desc": "Define structured growth paths with clear expectations at each level for individual contributors and engineering managers",
    "desc_zh": "为个人贡献者和工程管理者定义结构化的成长路径，明确每个层级的期望",
    "steps": [
      "Define distinct levels (e.g., L1-L7 or Junior to Distinguished) with clear scope, impact, and autonomy expectations",
      "Create parallel IC (individual contributor) and management tracks with equivalent levels and compensation",
      "Describe observable behaviors and artifacts for each level across dimensions: technical skill, leadership, communication, delivery",
      "Establish a calibration process where managers align on level expectations across teams",
      "Review and update the framework annually based on organizational evolution and employee feedback"
    ],
    "steps_zh": [
      "定义不同级别（如 L1-L7 或初级到杰出级），明确每级的职责范围、影响力和自主权期望",
      "创建并行的个人贡献者（IC）和管理轨道，具有等效的级别和薪酬",
      "在技术技能、领导力、沟通、交付等维度为每个级别描述可观察的行为和产出物",
      "建立校准流程，使各团队管理者在级别期望上达成一致",
      "每年根据组织演变和员工反馈审查并更新框架"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "IC Track",
      "Management Track",
      "Level Behaviors",
      "Calibration",
      "Annual Review"
    ],
    "viz_labels_zh": [
      "IC通道",
      "管理通道",
      "级别行为",
      "校准",
      "年度评审"
    ],
    "related": [
      "developer-experience-framework",
      "blameless-postmortems",
      "team-topologies"
    ],
    "tags": [
      "career",
      "growth",
      "engineering-levels",
      "management",
      "retention"
    ],
    "origin_author": "Multiple origins; notably Rent the Runway (Camille Fournier, ~2015) and early frameworks at Google and Facebook",
    "origin_source": "The Manager's Path (Camille Fournier, 2017); referenced in Accelerate (Forsgren, Humble & Kim, 2018) and An Elegant Puzzle (Will Larson, 2019)",
    "origin_source_zh": "《管理之路》（Camille Fournier，2017）；在《加速》（Forsgren、Humble & Kim，2018）和《优雅的谜题》（Will Larson，2019）中有引用",
    "complexity": "intermediate",
    "when_to_use": [
      "Engineering organizations growing beyond 20-30 engineers where ad-hoc promotion decisions become inconsistent",
      "Companies struggling to retain senior ICs because the only growth path leads to management",
      "Organizations needing to align compensation and expectations across distributed teams and offices",
      "Teams where engineers express uncertainty about what is expected of them to advance"
    ],
    "when_to_use_zh": [
      "工程组织增长到 20-30 人以上，临时性的晋升决策变得不一致",
      "公司难以留住资深 IC，因为唯一的成长路径指向管理岗",
      "组织需要在分布式团队和办公室之间对齐薪酬和期望",
      "工程师对晋升所需达到的期望感到不确定的团队"
    ],
    "core_concepts": [
      "Dual Track: Parallel IC and management ladders with equivalent prestige and compensation, so engineers are not forced into management to advance",
      "Scope and Impact: Higher levels are distinguished by increasing scope (team → org → company → industry) and the ambiguity of problems tackled",
      "Observable Behaviors: Level expectations are defined as concrete, observable actions rather than vague traits — 'designs systems used by 3+ teams' rather than 'is technical'",
      "Calibration: Regular cross-team sessions where managers compare employees at the same level to ensure consistent standards",
      "Anti-Peter Principle: Clear expectations help identify whether someone is performing at their current level, preventing promotion to incompetence"
    ],
    "core_concepts_zh": [
      "双轨制：并行的 IC 和管理阶梯具有等同的声望和薪酬，工程师无需被迫转管理才能晋升",
      "职责范围与影响力：更高级别通过不断扩大的职责范围（团队→组织→公司→行业）和所解决问题的模糊性来区分",
      "可观察行为：级别期望定义为具体可观察的行动而非模糊特质——「设计被 3 个以上团队使用的系统」而非「有技术能力」",
      "校准：定期跨团队会议，管理者比较同一级别的员工以确保一致的标准",
      "反彼得原理：清晰的期望有助于判断某人是否在当前级别有效工作，防止因晋升导致能力不胜任"
    ],
    "timeline": [
      [
        "2005",
        "Google publishes early internal engineering level definitions (L3-L10) that become a widely copied template"
      ],
      [
        "2013",
        "Rent the Runway publishes one of the first public engineering ladders, later expanded by Camille Fournier"
      ],
      [
        "2017",
        "Camille Fournier's The Manager's Path provides comprehensive guidance on engineering levels and management tracks"
      ],
      [
        "2019",
        "Will Larson's An Elegant Puzzle addresses ladder design as a systems problem and popularizes calibration practices"
      ],
      [
        "2023",
        "Companies like Dropbox and CircleCI publish open-source career frameworks, creating industry-wide standardization"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "Google 发布早期内部工程级别定义（L3-L10），成为广泛复制的模板"
      ],
      [
        "2013",
        "Rent the Runway 发布首批公开工程阶梯之一，后由 Camille Fournier 扩展"
      ],
      [
        "2017",
        "Camille Fournier 的《管理之路》提供了关于工程级别和管理轨道的全面指导"
      ],
      [
        "2019",
        "Will Larson 的《优雅的谜题》将阶梯设计视为系统问题，并推广了校准实践"
      ],
      [
        "2023",
        "Dropbox 和 CircleCI 等公司发布开源职业框架，推动行业范围的标准化"
      ]
    ],
    "dos": [
      "Make the ladder public within the organization so every engineer can self-assess and understand expectations",
      "Include concrete examples and anti-examples at each level to make expectations actionable",
      "Run calibration sessions across teams at least twice a year to prevent grade inflation or inconsistency",
      "Revisit the ladder as the organization grows — expectations for a Staff engineer at 100 people differ from one at 5,000"
    ],
    "dos_zh": [
      "在组织内公开阶梯，让每位工程师都能自我评估并理解期望",
      "在每个级别包含具体的正面和反面示例，使期望可操作",
      "每年至少两次跨团队进行校准会议，防止级别膨胀或不一致",
      "随组织成长重新审视阶梯——100 人时对 Staff 工程师的期望与 5,000 人时不同"
    ],
    "donts": [
      "Don't create too many levels — 5-7 IC levels is typical; more than 10 creates meaningless distinctions",
      "Don't use the ladder as a checklist — promotion should reflect sustained performance at the next level, not box-ticking",
      "Don't make management the only path to higher compensation, or you will lose your best technical talent",
      "Don't copy another company's ladder verbatim — Google's L3-L10 reflects Google's scale and culture, not yours"
    ],
    "donts_zh": [
      "不要创建过多级别——5-7 个 IC 级别是典型的；超过 10 个会产生无意义的区分",
      "不要将阶梯当作清单——晋升应反映在下一级别的持续表现，而非逐项打勾",
      "不要将管理岗作为获得更高薪酬的唯一路径，否则会流失最优秀的技术人才",
      "不要逐字复制其他公司的阶梯——Google 的 L3-L10 反映的是 Google 的规模和文化，不是你的"
    ],
    "case_study_company": "Rent the Runway",
    "case_study": "Rent the Runway, under CTO Camille Fournier, created one of the earliest publicly shared engineering ladders around 2013-2015. The framework defined five IC levels and a parallel management track, each with specific expectations across technical skill, ownership, and collaboration. Before the ladder, promotion decisions were inconsistent and engineers frequently left due to unclear growth paths. After implementation, engineering retention improved measurably, and the framework was so well-received that Fournier expanded the concepts into The Manager's Path (2017), which became the definitive guide on engineering career frameworks across the industry.",
    "case_study_zh": "Rent the Runway 在 CTO Camille Fournier 的领导下，于 2013-2015 年间创建了最早公开分享的工程阶梯之一。该框架定义了五个 IC 级别和一条并行管理轨道，每个级别在技术技能、所有权和协作方面有明确期望。在阶梯实施前，晋升决策不一致，工程师经常因成长路径不清晰而离职。实施后，工程师留存率显著提升，该框架获得极佳反响，Fournier 将这些概念扩展为《管理之路》（2017），成为业界工程职业框架的权威指南。",
    "when_not_to_use": [
      "Very early-stage startups (under 10 engineers) where roles are fluid and formal levels create unnecessary rigidity",
      "Organizations where all engineers are expected to be full-stack generalists with identical expectations",
      "Cultures that strongly prefer flat hierarchies and resist any form of level differentiation",
      "Temporary project teams or consultancies where long-term career growth within the organization is not the primary concern"
    ],
    "when_not_to_use_zh": [
      "非常早期的创业公司（10 名工程师以下），角色流动性强，正式级别会造成不必要的僵化",
      "期望所有工程师都是全栈通才、且对每个人的要求完全相同的组织",
      "强烈偏好扁平层级并抵制任何形式的级别差异化的文化",
      "临时项目团队或咨询公司，组织内的长期职业成长不是主要关切"
    ],
    "adopters": [
      "Google",
      "Meta",
      "Rent the Runway",
      "Dropbox",
      "Spotify"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Fournier, C. (2017). \"The Manager's Path: A Guide for Tech Leaders Navigating Growth and Change\". O'Reilly Media.",
    "secondary_sources": [
      "Larson, W. (2019). \"An Elegant Puzzle: Systems of Engineering Management\". Stripe Press.",
      "Reilly, T. (2022). \"The Staff Engineer's Path\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "developer-experience-framework",
        "type": "complement"
      },
      {
        "slug": "blameless-postmortems",
        "type": "complement"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      }
    ]
  },
  {
    "id": 146,
    "name": "Blameless Postmortems",
    "name_zh": "无指责事后复盘",
    "slug": "blameless-postmortems",
    "category": "team",
    "desc": "Conduct structured incident reviews focused on systemic learning rather than individual blame",
    "desc_zh": "进行结构化的事故复盘，聚焦于系统性学习而非个人指责",
    "steps": [
      "Document the incident timeline with objective facts: what happened, when, and what actions were taken",
      "Gather all participants and stakeholders for a facilitated review within 48 hours of incident resolution",
      "Analyze contributing factors using techniques like the '5 Whys' or fault tree analysis, focusing on systems not individuals",
      "Identify actionable remediation items with clear owners and deadlines, prioritized by impact",
      "Publish the postmortem internally for organizational learning and track remediation completion"
    ],
    "steps_zh": [
      "用客观事实记录事故时间线：发生了什么、何时发生、采取了什么行动",
      "在事故解决后 48 小时内召集所有参与者和利益相关者进行有引导的复盘",
      "使用「5 个为什么」或故障树分析等技术分析促成因素，关注系统而非个人",
      "识别可执行的补救项，明确负责人和截止日期，按影响优先排序",
      "在组织内部发布事后复盘报告以促进学习，并跟踪补救完成情况"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Timeline",
      "Facilitated Review",
      "5 Whys Analysis",
      "Action Items",
      "Publish Learnings"
    ],
    "viz_labels_zh": [
      "事件时间线",
      "复盘会议",
      "五问分析",
      "改进项",
      "发布学习"
    ],
    "related": [
      "technical-debt-management-framework",
      "developer-experience-framework",
      "team-topologies"
    ],
    "tags": [
      "incidents",
      "learning",
      "culture",
      "postmortem",
      "resilience",
      "SRE"
    ],
    "origin_author": "John Allspaw, 2012; rooted in Sidney Dekker's Just Culture and safety science",
    "origin_source": "Etsy's engineering blog (Allspaw, 2012); Site Reliability Engineering (Beyer et al., 2016); referenced in Accelerate (Forsgren, Humble & Kim, 2018)",
    "origin_source_zh": "Etsy 工程博客（Allspaw，2012）；《SRE：Google 运维解密》（Beyer 等，2016）；在《加速》（Forsgren、Humble & Kim，2018）中有引用",
    "complexity": "beginner",
    "when_to_use": [
      "After any production incident that caused user impact or SLA breach, regardless of severity",
      "Organizations trying to build a learning culture where engineers feel safe to take risks and report failures",
      "Teams experiencing repeated incidents from the same root causes, indicating failure to learn from past events",
      "Environments transitioning from a blame-oriented culture that suppresses incident reporting"
    ],
    "when_to_use_zh": [
      "任何导致用户影响或 SLA 违约的生产事故之后，不论严重程度",
      "试图建立学习文化、让工程师能够安心承担风险并上报故障的组织",
      "团队因相同根因反复发生事故，表明未能从过去事件中学习",
      "从抑制事故报告的指责文化向学习文化过渡的环境"
    ],
    "core_concepts": [
      "Blamelessness: The explicit agreement that individuals will not be punished for honest mistakes — this is what makes people willing to share the full truth about what happened",
      "Systems thinking: Incidents are caused by systemic factors (missing guardrails, poor observability, flawed processes), not individual failures",
      "Counterfactual reasoning avoidance: Avoid 'if only X had done Y' statements — they are unfalsifiable and prevent systemic improvement",
      "Action items over apologies: The postmortem's value is measured by the quality and completion rate of remediation items, not by who apologized",
      "Organizational memory: Published postmortems become a searchable knowledge base that prevents institutional amnesia about past incidents"
    ],
    "core_concepts_zh": [
      "无指责：明确约定不会因诚实错误而惩罚个人——这使人们愿意分享事件发生的全部真相",
      "系统思维：事故由系统性因素导致（缺少防护栏、可观测性差、流程缺陷），而非个人失误",
      "避免反事实推理：避免「如果 X 当时做了 Y 就好了」的陈述——这些无法证伪且阻碍系统性改进",
      "行动项优于道歉：事后复盘的价值以补救项的质量和完成率衡量，而非谁道了歉",
      "组织记忆：已发布的事后复盘成为可搜索的知识库，防止组织对过去事故的记忆遗失"
    ],
    "timeline": [
      [
        "2004",
        "Sidney Dekker publishes 'Just Culture' in aviation safety, establishing the theoretical foundation for blame-free incident analysis"
      ],
      [
        "2012",
        "John Allspaw publishes 'Blameless PostMortems and a Just Culture' on the Etsy engineering blog, adapting safety science to software"
      ],
      [
        "2014",
        "PagerDuty open-sources their incident response and postmortem documentation, establishing industry templates"
      ],
      [
        "2016",
        "Google's SRE book dedicates a chapter to postmortem culture, making blamelessness a mainstream SRE practice"
      ],
      [
        "2018",
        "Forsgren, Humble and Kim identify a generative (blame-free) culture as a key predictor of high-performing teams in Accelerate"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "Sidney Dekker 发表航空安全领域的「公正文化」理论，为无指责事故分析奠定理论基础"
      ],
      [
        "2012",
        "John Allspaw 在 Etsy 工程博客发表《无指责事后复盘与公正文化》，将安全科学引入软件领域"
      ],
      [
        "2014",
        "PagerDuty 开源其事故响应和事后复盘文档，建立行业模板"
      ],
      [
        "2016",
        "Google 的 SRE 书籍专设一章讨论事后复盘文化，使无指责成为主流 SRE 实践"
      ],
      [
        "2018",
        "Forsgren、Humble 和 Kim 在《加速》中将生成型（无指责）文化确定为高绩效团队的关键预测因子"
      ]
    ],
    "dos": [
      "Facilitate the postmortem with a neutral party who was not directly involved in the incident",
      "Document the timeline with precise timestamps and objective facts before the meeting to ground the discussion",
      "Follow up rigorously on action items — a postmortem without completed remediation is organizational theater",
      "Celebrate postmortems as learning events, not punishment rituals — some teams award 'best postmortem' recognition"
    ],
    "dos_zh": [
      "由未直接参与事故的中立方主持事后复盘",
      "在会议前用精确时间戳和客观事实记录时间线，为讨论奠定基础",
      "严格跟进行动项——没有完成补救的事后复盘是组织表演",
      "将事后复盘视为学习活动而非惩罚仪式——一些团队会颁发「最佳事后复盘」认可"
    ],
    "donts": [
      "Don't allow the postmortem to devolve into finger-pointing — the facilitator must redirect 'who' questions to 'what' and 'why' questions",
      "Don't skip the postmortem for 'small' incidents — small failures are the cheapest learning opportunities",
      "Don't let action items languish in a backlog forever — set deadlines and review completion weekly",
      "Don't confuse blameless with accountability-free — the process is blame-free, but systemic issues must still be fixed"
    ],
    "donts_zh": [
      "不要让事后复盘演变为互相指责——主持人必须将「谁」的问题重新引导为「什么」和「为什么」的问题",
      "不要因「小」事故就跳过事后复盘——小故障是成本最低的学习机会",
      "不要让行动项永远滞留在待办事项中——设定截止日期并每周审查完成情况",
      "不要将无指责与无责任混淆——流程是无指责的，但系统性问题仍必须修复"
    ],
    "case_study_company": "Etsy",
    "case_study": "Etsy, under CTO John Allspaw (2011-2015), became the poster child for blameless postmortems in software engineering. When a database engineer accidentally dropped a production table in 2012, instead of punishment, Allspaw facilitated a blameless postmortem that revealed the deployment tooling lacked safeguards against destructive operations. The resulting action items — adding confirmation prompts, read-only replicas for queries, and automated backups — prevented similar incidents. Allspaw published the approach on Etsy's engineering blog, and it became the foundation for modern incident management practices industry-wide. Etsy reported that their incident recurrence rate dropped by over 50% within a year of adopting consistent blameless postmortems.",
    "case_study_zh": "Etsy 在 CTO John Allspaw（2011-2015 在任）的带领下，成为软件工程领域无指责事后复盘的典范。2012 年一位数据库工程师意外删除了生产表，Allspaw 没有进行惩罚，而是主持了无指责事后复盘，揭示了部署工具缺少针对破坏性操作的防护措施。由此产生的行动项——添加确认提示、用于查询的只读副本和自动备份——防止了类似事故。Allspaw 在 Etsy 工程博客上发表了这一方法，成为全行业现代事故管理实践的基础。Etsy 报告在持续采用无指责事后复盘的一年内，事故复发率下降了 50% 以上。",
    "when_not_to_use": [
      "Incidents caused by deliberate malicious action (security breaches) where accountability, not blamelessness, is the appropriate response",
      "Organizations where leadership has not genuinely committed to blame-free culture — superficial adoption breeds cynicism",
      "Trivial issues that do not warrant the overhead of a formal postmortem process (use lightweight retros instead)",
      "Contexts where regulatory compliance requires formal root cause attribution to specific individuals or roles"
    ],
    "when_not_to_use_zh": [
      "由故意恶意行为（安全漏洞）导致的事故，此时问责而非无指责才是适当的回应",
      "领导层未真正承诺无指责文化的组织——表面采纳会滋生犬儒主义",
      "不值得正式事后复盘流程开销的琐碎问题（改用轻量级回顾替代）",
      "法规合规要求将根因正式归因于特定个人或角色的场景"
    ],
    "adopters": [
      "Etsy",
      "Google",
      "PagerDuty",
      "Netflix",
      "Atlassian"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Allspaw, J. (2012). \"Blameless PostMortems and a Just Culture\". Etsy Code as Craft Blog.",
    "secondary_sources": [
      "Beyer, B. et al. (2016). \"Site Reliability Engineering: How Google Runs Production Systems\", Ch. 15. O'Reilly Media.",
      "Dekker, S. (2012). \"Just Culture: Restoring Trust and Accountability in Your Organization\", 2nd ed. CRC Press."
    ],
    "typed_relations": [
      {
        "slug": "technical-debt-management-framework",
        "type": "complement"
      },
      {
        "slug": "developer-experience-framework",
        "type": "complement"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      }
    ]
  },
  {
    "id": 147,
    "name": "Architecture Review Board (ARB)",
    "name_zh": "架构评审委员会",
    "slug": "architecture-review-board",
    "category": "team",
    "desc": "A governance body that reviews and guides significant architectural decisions to ensure consistency, quality, and strategic alignment",
    "desc_zh": "审查和指导重大架构决策的治理机构，确保一致性、质量和战略对齐",
    "steps": [
      "Charter the ARB with a clear mandate: which decisions require review (e.g., new services, technology introductions, major API changes)",
      "Compose the board with senior architects and rotating members from stream-aligned teams to avoid ivory-tower syndrome",
      "Define a lightweight submission process (RFC or Architecture Decision Record) that teams complete before the review",
      "Conduct reviews as collaborative design sessions, not approval gates — the ARB advises, teams decide",
      "Publish all decisions and rationale transparently to build organizational architectural knowledge"
    ],
    "steps_zh": [
      "为 ARB 制定明确的章程：哪些决策需要评审（如新服务、引入新技术、重大 API 变更）",
      "由资深架构师和来自面向价值流团队的轮换成员组成委员会，避免象牙塔综合症",
      "定义轻量级的提交流程（RFC 或架构决策记录），团队在评审前完成",
      "将评审作为协作设计会议而非审批关卡——ARB 提供建议，团队做出决定",
      "透明发布所有决策和理由，构建组织的架构知识"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Mandate",
      "Board Composition",
      "RFC Submission",
      "Collaborative Review",
      "Publish Decision"
    ],
    "viz_labels_zh": [
      "职责范围",
      "委员会组成",
      "RFC提交",
      "协作评审",
      "决策发布"
    ],
    "related": [
      "conways-law",
      "technical-debt-management-framework",
      "team-topologies",
      "continuous-architecture"
    ],
    "tags": [
      "governance",
      "architecture",
      "review-board",
      "ADR",
      "decision-making"
    ],
    "origin_author": "Enterprise architecture tradition; modernized by ThoughtWorks Technology Radar approach (~2010)",
    "origin_source": "Software Architecture in Practice (Bass, Clements & Kazman, 3rd ed., 2012); referenced in Fundamentals of Software Architecture (Richards & Ford, 2020)",
    "origin_source_zh": "《软件架构实践》（Bass、Clements & Kazman，第三版，2012）；在《软件架构基础》（Richards & Ford，2020）中有引用",
    "complexity": "intermediate",
    "when_to_use": [
      "Organizations with 10+ teams where architectural consistency across services is critical",
      "Environments where costly architectural mistakes (wrong database choice, incompatible API styles) have occurred repeatedly",
      "Regulated industries where architectural decisions need documented rationale for compliance audits",
      "Growing organizations transitioning from a single architect to distributed architectural decision-making"
    ],
    "when_to_use_zh": [
      "拥有 10 个以上团队、跨服务架构一致性至关重要的组织",
      "高成本架构错误（错误的数据库选择、不兼容的 API 风格）反复发生的环境",
      "受监管行业中架构决策需要记录理由以通过合规审计",
      "从单一架构师向分布式架构决策转型的成长型组织"
    ],
    "core_concepts": [
      "Advisory not gatekeeping: Modern ARBs advise and influence rather than approve or block — they provide guardrails, not gates",
      "Architecture Decision Records (ADRs): Lightweight documents capturing the context, decision, and consequences of each significant architectural choice",
      "Rotating membership: Including engineers from delivery teams prevents the ARB from becoming disconnected from implementation reality",
      "Decision scope: The ARB only reviews decisions above a defined threshold (cross-team impact, new technology, security implications), not every design choice",
      "Organizational memory: Published ADRs create a searchable history of why architectural decisions were made, preventing repeated debates"
    ],
    "core_concepts_zh": [
      "建议而非关卡：现代 ARB 提供建议和影响而非审批或阻拦——它们提供护栏而非关卡",
      "架构决策记录（ADR）：捕获每个重大架构选择的上下文、决策和后果的轻量级文档",
      "轮换成员制：纳入来自交付团队的工程师，防止 ARB 与实现现实脱节",
      "决策范围：ARB 仅评审超过定义阈值的决策（跨团队影响、新技术、安全影响），而非每个设计选择",
      "组织记忆：已发布的 ADR 创建了架构决策理由的可搜索历史，防止重复讨论"
    ],
    "timeline": [
      [
        "1995",
        "The Zachman Framework and TOGAF establish formal enterprise architecture governance including review boards"
      ],
      [
        "2010",
        "ThoughtWorks Technology Radar introduces a lightweight, collaborative approach to architectural governance"
      ],
      [
        "2011",
        "Michael Nygard proposes Architecture Decision Records as a lightweight alternative to heavy governance documents"
      ],
      [
        "2019",
        "Team Topologies advocates for enabling teams (including architecture guidance) rather than traditional gatekeeping ARBs"
      ],
      [
        "2023",
        "Organizations adopt async ARB reviews via ADRs in GitHub/GitLab, reducing meeting overhead while maintaining governance"
      ]
    ],
    "timeline_zh": [
      [
        "1995",
        "Zachman 框架和 TOGAF 建立正式的企业架构治理（包括评审委员会）"
      ],
      [
        "2010",
        "ThoughtWorks 技术雷达引入轻量级协作式架构治理方法"
      ],
      [
        "2011",
        "Michael Nygard 提出架构决策记录作为重量级治理文档的轻量替代"
      ],
      [
        "2019",
        "《团队拓扑》倡导赋能团队（包括架构指导）而非传统关卡式 ARB"
      ],
      [
        "2023",
        "组织通过 GitHub/GitLab 中的 ADR 进行异步 ARB 评审，减少会议开销同时保持治理"
      ]
    ],
    "dos": [
      "Keep the review process lightweight — a one-page ADR template is better than a 30-slide architecture deck",
      "Include rotating members from delivery teams so the ARB stays grounded in real-world implementation constraints",
      "Publish all ADRs and review outcomes transparently so the entire organization benefits from the architectural reasoning",
      "Define clear criteria for what requires ARB review and what does not, to avoid becoming a bottleneck"
    ],
    "dos_zh": [
      "保持评审流程轻量——一页 ADR 模板优于 30 页架构演示文稿",
      "纳入来自交付团队的轮换成员，使 ARB 扎根于真实实现约束",
      "透明发布所有 ADR 和评审结果，使整个组织受益于架构推理",
      "明确定义哪些需要 ARB 评审、哪些不需要，避免成为瓶颈"
    ],
    "donts": [
      "Don't let the ARB become an ivory tower that dictates architecture without understanding delivery realities",
      "Don't require ARB approval for every technical decision — focus only on high-impact, cross-cutting choices",
      "Don't staff the ARB exclusively with senior architects — include hands-on engineers who write code daily",
      "Don't make the ARB a veto gate — if it blocks teams without offering alternatives, it destroys autonomy and trust"
    ],
    "donts_zh": [
      "不要让 ARB 成为不了解交付现实却指挥架构的象牙塔",
      "不要要求每个技术决策都经 ARB 审批——仅关注高影响力的跨领域选择",
      "不要让 ARB 成员全部由资深架构师组成——要包含每天写代码的一线工程师",
      "不要让 ARB 成为否决关卡——如果它阻止团队却不提供替代方案，就会摧毁自主权和信任"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify evolved its architectural governance from a traditional ARB to a more collaborative model around 2016. Instead of a central board that approved or rejected designs, they established 'System Owners' — senior engineers who owned specific architectural domains (data platform, messaging, etc.) and reviewed ADRs asynchronously via pull requests. Any engineer could propose an architectural change by opening an ADR PR and tagging relevant system owners. This approach reduced review latency from weeks to days, increased the number of architecturally significant decisions documented by 4x, and maintained consistency without creating bottlenecks — a model later reinforced by Team Topologies' concept of enabling teams.",
    "case_study_zh": "Spotify 在 2016 年前后将其架构治理从传统 ARB 演进为更协作的模型。他们没有设立审批或拒绝设计的中央委员会，而是建立了「系统负责人」——负责特定架构领域（数据平台、消息传递等）的资深工程师，通过拉取请求异步评审 ADR。任何工程师都可以通过开启 ADR PR 并标记相关系统负责人来提出架构变更。这种方法将评审延迟从数周缩短到数天，使记录在案的架构重要决策数量增加了 4 倍，在不产生瓶颈的情况下保持了一致性——这一模型后来被《团队拓扑》中赋能团队的概念所强化。",
    "when_not_to_use": [
      "Small teams (under 3 teams) where informal architectural discussions in standups or Slack are sufficient",
      "Fast-moving startups where the overhead of formal reviews would slow down critical time-to-market",
      "Organizations with a single technology stack where architectural decisions are inherently constrained",
      "Teams practicing extreme programming with emergent architecture that resists up-front architectural planning"
    ],
    "when_not_to_use_zh": [
      "小型团队（3 个以下），站会或 Slack 中的非正式架构讨论即可满足需求",
      "快速发展的创业公司，正式评审的开销会拖慢关键的上市时间",
      "使用单一技术栈的组织，架构决策本身就受到固有约束",
      "实践极限编程、采用浮现式架构的团队，不适合前置的架构规划"
    ],
    "adopters": [
      "Spotify",
      "Google",
      "ThoughtWorks",
      "Capital One",
      "Zalando"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Bass, L., Clements, P. & Kazman, R. (2012). \"Software Architecture in Practice\", 3rd ed. Addison-Wesley.",
    "secondary_sources": [
      "Richards, M. & Ford, N. (2020). \"Fundamentals of Software Architecture\". O'Reilly Media.",
      "Keeling, M. (2017). \"Design It! From Programmer to Software Architect\". Pragmatic Bookshelf."
    ],
    "typed_relations": [
      {
        "slug": "conways-law",
        "type": "complement"
      },
      {
        "slug": "technical-debt-management-framework",
        "type": "complement"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      },
      {
        "slug": "continuous-architecture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 148,
    "name": "Technical Debt Management Framework",
    "name_zh": "技术债务管理框架",
    "slug": "technical-debt-management-framework",
    "category": "team",
    "desc": "A systematic approach to identifying, quantifying, prioritizing, and paying down technical debt across an engineering organization",
    "desc_zh": "在工程组织中系统性地识别、量化、优先排序和偿还技术债务的方法",
    "steps": [
      "Create a shared tech debt inventory: catalog known debt items with their type (code, architecture, infrastructure, test), impact, and estimated remediation cost",
      "Quantify debt impact using metrics: developer friction (time lost), incident correlation, deployment frequency impact, and customer-facing effects",
      "Prioritize using a cost-of-delay model: rank debt items by the ongoing cost of not fixing them versus the one-time cost of remediation",
      "Allocate a consistent capacity budget (typically 15-20% of sprint capacity) dedicated to debt reduction each iteration",
      "Track debt trends over time with a tech debt dashboard and report progress to stakeholders quarterly"
    ],
    "steps_zh": [
      "创建共享的技术债务清单：按类型（代码、架构、基础设施、测试）、影响和估计修复成本记录已知债务项",
      "使用指标量化债务影响：开发者摩擦（浪费的时间）、事故相关性、部署频率影响和面向客户的效果",
      "使用延迟成本模型优先排序：按不修复的持续成本与一次性修复成本的比值对债务项排名",
      "每个迭代分配固定的容量预算（通常为冲刺容量的 15-20%）专门用于债务削减",
      "通过技术债务仪表板追踪债务趋势，每季度向利益相关者报告进展"
    ],
    "ai_relevant": true,
    "viz_type": "quadrant",
    "viz_labels": [
      "High Impact",
      "High Effort",
      "Low Impact",
      "Low Effort"
    ],
    "viz_labels_zh": [
      "高影响",
      "高成本",
      "低影响",
      "低成本"
    ],
    "related": [
      "technical-debt-quadrant",
      "blameless-postmortems",
      "architecture-review-board",
      "continuous-architecture"
    ],
    "tags": [
      "technical-debt",
      "sustainability",
      "prioritization",
      "engineering-excellence",
      "maintenance"
    ],
    "origin_author": "Ward Cunningham, 1992 (debt metaphor); Martin Fowler, 2009 (Technical Debt Quadrant)",
    "origin_source": "Technical Debt Quadrant (Fowler, 2009); The Mythical Man-Month (Brooks, 1975, on system entropy); Accelerate (Forsgren, Humble & Kim, 2018)",
    "origin_source_zh": "技术债务象限（Fowler，2009）；《人月神话》（Brooks，1975，关于系统熵增）；《加速》（Forsgren、Humble & Kim，2018）",
    "complexity": "intermediate",
    "when_to_use": [
      "Engineering velocity is declining despite stable team size, suggesting accumulated debt is creating drag",
      "Teams spend more than 30% of their time on unplanned rework, workarounds, or fighting fragile systems",
      "Product leadership needs a data-driven argument for investing in technical improvements over new features",
      "Post-acquisition or post-rapid-growth phases where shortcuts taken during growth need systematic remediation"
    ],
    "when_to_use_zh": [
      "在团队规模稳定的情况下工程速度下降，表明累积的债务正在产生阻力",
      "团队超过 30% 的时间花在计划外返工、变通方案或处理脆弱系统上",
      "产品领导层需要数据驱动的论据来支持投资技术改进而非新功能",
      "收购后或快速增长后阶段，增长期间的捷径需要系统性修复"
    ],
    "core_concepts": [
      "Debt as a financial metaphor: Technical debt accrues 'interest' — the longer it remains, the more ongoing cost it imposes through slower development, more incidents, and harder onboarding",
      "Fowler's Quadrant: Debt is classified along two axes — deliberate/inadvertent and reckless/prudent — helping teams distinguish strategic shortcuts from careless code",
      "Capacity allocation: Rather than occasional 'tech debt sprints', sustainable teams reserve a consistent percentage of each iteration for debt reduction",
      "Cost of delay: The most effective prioritization metric — debt items that cause daily developer friction should be fixed before items with only occasional impact",
      "Brooks's Law of System Entropy: As described in The Mythical Man-Month, without active maintenance, software systems tend toward increasing disorder and reduced conceptual integrity"
    ],
    "core_concepts_zh": [
      "债务即金融隐喻：技术债务会累积「利息」——存在越久，通过降低开发速度、增加事故和加大入职难度施加的持续成本就越高",
      "Fowler 象限：按两个轴对债务分类——有意/无意和鲁莽/审慎——帮助团队区分战略性捷径和粗心代码",
      "容量分配：可持续的团队在每个迭代中保留固定百分比用于债务削减，而非偶尔的「技术债务冲刺」",
      "延迟成本：最有效的优先排序指标——每天造成开发者摩擦的债务项应优先于仅偶尔产生影响的项目",
      "布鲁克斯的系统熵增定律：如《人月神话》所述，没有积极维护，软件系统会趋向日益增加的无序和降低的概念完整性"
    ],
    "timeline": [
      [
        "1975",
        "Fred Brooks describes system entropy and the tendency of software to degrade over time in The Mythical Man-Month"
      ],
      [
        "1992",
        "Ward Cunningham coins the 'technical debt' metaphor at OOPSLA to explain why refactoring is an investment, not a cost"
      ],
      [
        "2009",
        "Martin Fowler publishes the Technical Debt Quadrant, providing a framework for categorizing different types of debt"
      ],
      [
        "2018",
        "Accelerate research demonstrates that teams with low technical debt deploy 46x more frequently with 7x lower change failure rate"
      ],
      [
        "2023",
        "AI-assisted debt identification tools (SonarQube AI, CodeScene) automate detection and prioritization of technical debt"
      ]
    ],
    "timeline_zh": [
      [
        "1975",
        "Fred Brooks 在《人月神话》中描述系统熵增和软件随时间退化的趋势"
      ],
      [
        "1992",
        "Ward Cunningham 在 OOPSLA 上创造「技术债务」隐喻，解释为什么重构是投资而非成本"
      ],
      [
        "2009",
        "Martin Fowler 发表技术债务象限，提供对不同类型债务进行分类的框架"
      ],
      [
        "2018",
        "《加速》的研究表明低技术债务的团队部署频率高 46 倍，变更失败率低 7 倍"
      ],
      [
        "2023",
        "AI 辅助的债务识别工具（SonarQube AI、CodeScene）自动化技术债务的检测和优先排序"
      ]
    ],
    "dos": [
      "Make tech debt visible to product stakeholders using business impact metrics (incidents caused, features delayed), not just engineering jargon",
      "Allocate a consistent 15-20% of sprint capacity to debt reduction every iteration, rather than scheduling sporadic 'debt sprints'",
      "Tie debt remediation to incidents — every postmortem should identify whether technical debt was a contributing factor",
      "Track debt trends quarterly to demonstrate whether the organization is gaining or losing ground"
    ],
    "dos_zh": [
      "使用业务影响指标（导致的事故、延迟的功能）而非纯工程术语让产品利益相关者看到技术债务",
      "每个迭代固定分配 15-20% 的冲刺容量用于债务削减，而非安排零散的「债务冲刺」",
      "将债务修复与事故关联——每次事后复盘都应识别技术债务是否为促成因素",
      "每季度跟踪债务趋势，展示组织是在缩小还是扩大债务差距"
    ],
    "donts": [
      "Don't treat all technical debt as equally urgent — reckless debt and prudent debt require different responses",
      "Don't promise zero tech debt — some debt is a deliberate, rational tradeoff for speed to market",
      "Don't let tech debt become a catch-all label — vague items like 'refactor everything' are not actionable debt items",
      "Don't schedule debt reduction only when there is 'spare time' — it will never happen because spare time does not exist"
    ],
    "donts_zh": [
      "不要将所有技术债务视为同等紧急——鲁莽的债务和审慎的债务需要不同的应对",
      "不要承诺零技术债务——某些债务是为了加快上市速度而做出的审慎合理权衡",
      "不要让技术债务成为万能标签——「重构所有东西」这类模糊条目不是可执行的债务项",
      "不要只在有「空闲时间」时才安排债务削减——空闲时间不存在，所以永远不会发生"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify implemented a systematic tech debt management approach in 2017 after teams reported that accumulated debt was slowing feature delivery by an estimated 25%. They introduced a 'Tech Health' scoring system where each squad assessed their codebase on a traffic-light scale (green/yellow/red) across dimensions like test coverage, deployment ease, and code clarity. Red scores triggered automatic allocation of 20% sprint capacity to remediation. Within 18 months, the percentage of red-scored components dropped from 34% to 12%, and the Accelerate metrics (deploy frequency and lead time) improved by 40% across the organization. The approach was cited in internal talks as making the case for sustained platform investment.",
    "case_study_zh": "Spotify 在 2017 年实施了系统化的技术债务管理方法，此前团队报告累积的债务使功能交付速度降低了约 25%。他们引入了「技术健康」评分系统，每个小队在测试覆盖率、部署便利性和代码清晰度等维度上使用交通灯标度（绿/黄/红）评估其代码库。红色评分自动触发 20% 冲刺容量用于修复。在 18 个月内，红色评分组件的比例从 34% 降至 12%，整个组织的加速指标（部署频率和前置时间）提升了 40%。该方法在内部分享中被引用为支持持续平台投资的有力依据。",
    "when_not_to_use": [
      "Pre-product-market-fit startups where the codebase may be thrown away — speed to learning matters more than code quality",
      "One-off projects or prototypes with a defined end date where long-term maintenance is not a concern",
      "Teams already practicing continuous refactoring (e.g., TDD with aggressive refactoring) where debt rarely accumulates",
      "Situations where the system is scheduled for decommission — paying down debt on a system being replaced is wasteful"
    ],
    "when_not_to_use_zh": [
      "尚未找到产品市场契合的创业公司，代码库可能被丢弃——学习速度比代码质量更重要",
      "有明确结束日期的一次性项目或原型，长期维护不是关切",
      "已在实践持续重构（如 TDD 配合积极重构）的团队，债务很少累积",
      "系统已计划退役——为即将被替换的系统偿还债务是浪费"
    ],
    "adopters": [
      "Spotify",
      "Google",
      "Shopify",
      "Atlassian",
      "ThoughtWorks"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Cunningham, W. (1992). \"The WyCash Portfolio Management System\". OOPSLA Experience Report.",
    "secondary_sources": [
      "Fowler, M. (2009). \"Technical Debt Quadrant\". martinfowler.com.",
      "Forsgren, N., Humble, J. & Kim, G. (2018). \"Accelerate: The Science of Lean Software and DevOps\". IT Revolution Press.",
      "Kruchten, P., Nord, R.L. & Ozkaya, I. (2019). \"Managing Technical Debt: Reducing Friction in Software Development\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "technical-debt-quadrant",
        "type": "complement"
      },
      {
        "slug": "blameless-postmortems",
        "type": "complement"
      },
      {
        "slug": "architecture-review-board",
        "type": "complement"
      },
      {
        "slug": "continuous-architecture",
        "type": "complement"
      }
    ]
  },
  {
    "id": 149,
    "name": "Developer Experience (DevEx) Framework",
    "name_zh": "开发者体验框架",
    "slug": "developer-experience-framework",
    "category": "team",
    "desc": "A structured approach to measuring and improving the three dimensions of developer experience: feedback loops, cognitive load, and flow state",
    "desc_zh": "结构化的方法来衡量和改善开发者体验的三个维度：反馈循环、认知负荷和心流状态",
    "steps": [
      "Measure the three DevEx dimensions: feedback loop speed, cognitive load, and flow state frequency through developer surveys and system telemetry",
      "Identify the highest-friction workflows: slow CI builds, complex environment setup, fragmented documentation",
      "Prioritize improvements using impact-effort analysis — target changes that improve many developers' daily experience",
      "Implement improvements iteratively: faster builds, better onboarding, improved tooling, streamlined processes",
      "Track DevEx metrics quarterly using both perceptual (survey) and behavioral (telemetry) data to measure progress"
    ],
    "steps_zh": [
      "通过开发者调查和系统遥测衡量 DevEx 三个维度：反馈循环速度、认知负荷和心流状态频率",
      "识别摩擦最大的工作流：CI 构建缓慢、环境搭建复杂、文档碎片化",
      "使用影响-努力分析优先排序改进——瞄准改善众多开发者日常体验的变更",
      "迭代实施改进：更快的构建、更好的入职体验、改进的工具、精简的流程",
      "每季度使用感知数据（调查）和行为数据（遥测）跟踪 DevEx 指标以衡量进展"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Feedback Loop",
      "Cognitive Load",
      "Flow State",
      "Friction Workflows",
      "DevEx Metrics"
    ],
    "viz_labels_zh": [
      "反馈循环",
      "认知负荷",
      "心流状态",
      "高摩擦工作流",
      "DevEx指标"
    ],
    "related": [
      "platform-engineering",
      "engineering-ladder",
      "technical-debt-management-framework",
      "team-topologies"
    ],
    "tags": [
      "developer-experience",
      "productivity",
      "cognitive-load",
      "flow-state",
      "feedback-loops"
    ],
    "origin_author": "Nicole Forsgren, Margaret-Anne Storey & Chandra Maddila, 2023",
    "origin_source": "DevEx: What Actually Drives Productivity (Forsgren, Storey & Maddila, ACM Queue, 2023); builds on Accelerate (Forsgren, Humble & Kim, 2018) and SPACE framework (2021)",
    "origin_source_zh": "《DevEx：究竟什么驱动生产力》（Forsgren、Storey & Maddila，ACM Queue，2023）；建立在《加速》（Forsgren、Humble & Kim，2018）和 SPACE 框架（2021）之上",
    "complexity": "intermediate",
    "when_to_use": [
      "Organizations where developer satisfaction surveys indicate frustration with tooling, processes, or build times",
      "Teams experiencing declining velocity despite stable headcount, suggesting friction rather than capacity is the issue",
      "Companies competing for engineering talent where developer experience is a key retention and recruitment differentiator",
      "Platform engineering teams needing a framework to prioritize which developer pain points to address first"
    ],
    "when_to_use_zh": [
      "开发者满意度调查显示对工具、流程或构建时间不满的组织",
      "在人员稳定的情况下速度下降的团队，表明摩擦而非产能才是问题",
      "争夺工程人才的公司，开发者体验是留存和招聘的关键差异化因素",
      "平台工程团队需要框架来优先排序应先解决哪些开发者痛点"
    ],
    "core_concepts": [
      "Feedback Loops: The speed at which developers get responses from tools, code reviews, CI/CD, and tests — faster feedback reduces context switching and frustration",
      "Cognitive Load: The amount of mental processing required to complete tasks — excessive load from poor documentation, complex systems, or unclear ownership degrades productivity",
      "Flow State: The ability to enter and sustain deep, focused work — interruptions, slow tools, and unnecessary meetings are the primary enemies of flow",
      "Perceptual + Behavioral Measurement: Effective DevEx measurement combines developer surveys (perceptual) with system telemetry (behavioral), because neither alone tells the full story",
      "Developer Productivity ≠ Output: The framework explicitly rejects lines-of-code or PR-count as productivity measures, focusing instead on the conditions that enable good work"
    ],
    "core_concepts_zh": [
      "反馈循环：开发者从工具、代码评审、CI/CD 和测试中获得响应的速度——更快的反馈减少上下文切换和挫败感",
      "认知负荷：完成任务所需的心智处理量——由文档差、系统复杂或职责不清导致的过度负荷会降低生产力",
      "心流状态：进入和维持深度专注工作的能力——中断、慢工具和不必要的会议是心流的主要敌人",
      "感知+行为双重衡量：有效的 DevEx 衡量结合开发者调查（感知）和系统遥测（行为），因为单独任何一种都无法讲述完整故事",
      "开发者生产力≠产出：该框架明确拒绝将代码行数或 PR 数量作为生产力衡量标准，转而关注支撑优质工作的条件"
    ],
    "timeline": [
      [
        "2018",
        "Forsgren, Humble and Kim publish Accelerate, establishing DORA metrics as the standard for measuring software delivery performance"
      ],
      [
        "2021",
        "The SPACE framework (Forsgren et al.) expands beyond DORA to include satisfaction, well-being, and collaboration dimensions"
      ],
      [
        "2023",
        "Forsgren, Storey and Maddila publish the DevEx framework in ACM Queue, distilling developer productivity into three actionable dimensions"
      ],
      [
        "2024",
        "Major tech companies (LinkedIn, Spotify, Uber) publicly adopt DevEx frameworks and share their measurement approaches"
      ],
      [
        "2025",
        "AI-powered DevEx tools emerge: predictive build optimization, automated toil detection, and intelligent code review routing"
      ]
    ],
    "timeline_zh": [
      [
        "2018",
        "Forsgren、Humble 和 Kim 发表《加速》，将 DORA 指标确立为衡量软件交付绩效的标准"
      ],
      [
        "2021",
        "SPACE 框架（Forsgren 等）超越 DORA，纳入满意度、幸福感和协作维度"
      ],
      [
        "2023",
        "Forsgren、Storey 和 Maddila 在 ACM Queue 发表 DevEx 框架，将开发者生产力提炼为三个可执行维度"
      ],
      [
        "2024",
        "主要科技公司（LinkedIn、Spotify、Uber）公开采纳 DevEx 框架并分享其衡量方法"
      ],
      [
        "2025",
        "AI 驱动的 DevEx 工具涌现：预测性构建优化、自动化苦差事检测和智能代码评审分配"
      ]
    ],
    "dos": [
      "Measure all three dimensions (feedback loops, cognitive load, flow state) — optimizing one while ignoring others gives an incomplete picture",
      "Combine survey data with system telemetry: a developer's perception of build speed may differ from actual build times, and both matter",
      "Start with the most universally painful friction point — usually CI/CD speed or environment setup — for quick, visible wins",
      "Share DevEx metrics transparently with engineering leadership to justify investment in developer tooling and platforms"
    ],
    "dos_zh": [
      "衡量所有三个维度（反馈循环、认知负荷、心流状态）——优化一个而忽略其他会给出不完整的画面",
      "将调查数据与系统遥测结合：开发者对构建速度的感知可能与实际构建时间不同，两者都重要",
      "从最普遍的痛点开始——通常是 CI/CD 速度或环境搭建——以获得快速、可见的成效",
      "透明地与工程领导层分享 DevEx 指标，为开发者工具和平台的投资提供依据"
    ],
    "donts": [
      "Don't use DevEx metrics as individual performance measures — they are organizational health indicators, not employee scorecards",
      "Don't rely solely on surveys without telemetry, or vice versa — developers may not realize their builds are slow if they have always been slow",
      "Don't over-optimize for one team's workflow at the expense of others — DevEx improvements should benefit the broadest developer population",
      "Don't ignore the social dimension: code review responsiveness, meeting load, and on-call burden are as much DevEx issues as tooling speed"
    ],
    "donts_zh": [
      "不要将 DevEx 指标用作个人绩效衡量——它们是组织健康指标，而非员工记分卡",
      "不要仅依赖调查而无遥测数据，反之亦然——如果构建一直很慢，开发者可能意识不到它们很慢",
      "不要以牺牲其他团队为代价过度优化某个团队的工作流——DevEx 改进应惠及最广泛的开发者群体",
      "不要忽视社交维度：代码评审响应速度、会议负担和值班负担同样是 DevEx 问题，不仅仅是工具速度"
    ],
    "case_study_company": "LinkedIn",
    "case_study": "LinkedIn established a dedicated Developer Experience team in 2021 after internal surveys revealed that engineers spent an average of 3.5 hours per week waiting on builds and dealing with flaky tests. Using the DevEx framework's three dimensions, they prioritized feedback loop speed first. They invested in remote build caching (reducing average build times from 12 minutes to 3 minutes), automatic flaky test quarantining, and a unified developer portal. Within a year, developer satisfaction scores improved by 28%, deploy frequency increased by 35%, and the estimated productivity savings equaled the output of 200 additional engineers — all without hiring. The success led LinkedIn to double their platform engineering investment in 2023.",
    "case_study_zh": "LinkedIn 在 2021 年建立了专门的开发者体验团队，此前内部调查显示工程师平均每周花 3.5 小时等待构建和处理不稳定测试。使用 DevEx 框架的三个维度，他们优先关注反馈循环速度。他们投资了远程构建缓存（将平均构建时间从 12 分钟缩短至 3 分钟）、自动隔离不稳定测试和统一开发者门户。一年内，开发者满意度分数提升了 28%，部署频率增加了 35%，估计的生产力节省相当于增加了 200 名工程师的产出——全部无需招聘。这一成功促使 LinkedIn 在 2023 年将平台工程投资翻倍。",
    "when_not_to_use": [
      "Very small teams (under 10 engineers) where informal communication naturally surfaces and resolves friction",
      "Organizations without the capacity to act on DevEx findings — measuring without improving breeds frustration",
      "Contexts where the primary bottleneck is product direction, not engineering friction — DevEx cannot fix a strategy problem",
      "Teams in crisis mode fighting critical production issues — stabilize operations before optimizing developer experience"
    ],
    "when_not_to_use_zh": [
      "非常小的团队（10 名工程师以下），非正式沟通自然能发现和解决摩擦",
      "没有能力根据 DevEx 发现采取行动的组织——只衡量不改进会滋生挫败感",
      "主要瓶颈是产品方向而非工程摩擦的场景——DevEx 无法修复战略问题",
      "正在处理关键生产问题的危机模式团队——先稳定运营再优化开发者体验"
    ],
    "adopters": [
      "LinkedIn",
      "Spotify",
      "Uber",
      "Microsoft",
      "Shopify"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Forsgren, N., Storey, M.-A. & Maddila, C. (2023). \"DevEx: What Actually Drives Productivity\". ACM Queue, 21(2).",
    "secondary_sources": [
      "Forsgren, N., Humble, J. & Kim, G. (2018). \"Accelerate: The Science of Lean Software and DevOps\". IT Revolution Press.",
      "Storey, M.-A. et al. (2021). \"The SPACE of Developer Productivity\". ACM Queue, 19(1)."
    ],
    "typed_relations": [
      {
        "slug": "platform-engineering",
        "type": "complement"
      },
      {
        "slug": "engineering-ladder",
        "type": "complement"
      },
      {
        "slug": "technical-debt-management-framework",
        "type": "complement"
      },
      {
        "slug": "team-topologies",
        "type": "complement"
      }
    ]
  },
  {
    "id": 150,
    "name": "Mob/Ensemble Programming",
    "name_zh": "群体编程 / 集合编程",
    "slug": "mob-ensemble-programming",
    "category": "team",
    "desc": "The whole team works together on one task at one computer, with a rotating driver and navigators providing real-time collaboration",
    "desc_zh": "整个团队在一台电脑上共同完成一项任务，轮换驾驶员和领航员进行实时协作",
    "steps": [
      "Gather the full team (3-6 people) at one workstation with a large shared display or screen-sharing tool",
      "Designate one person as the Driver (types code) and everyone else as Navigators (direct the approach and design)",
      "Rotate the Driver role on a fixed timer (typically every 10-15 minutes) to maintain engagement and shared ownership",
      "Navigators discuss design, catch errors, and guide the Driver at the highest level of abstraction they can follow",
      "Use retrospectives after each session to adjust rotation intervals, team composition, and facilitation practices"
    ],
    "steps_zh": [
      "将整个团队（3-6 人）聚集在一个工作站前，使用大型共享显示器或屏幕共享工具",
      "指定一人为驾驶员（键入代码），其他人为领航员（指导方法和设计）",
      "按固定计时器（通常每 10-15 分钟）轮换驾驶员角色，以保持参与度和共同所有权",
      "领航员讨论设计、发现错误，并在驾驶员能跟上的最高抽象层面进行指导",
      "每次会议后使用回顾来调整轮换间隔、团队组成和引导实践"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Driver",
      "Navigators",
      "Rotation Timer",
      "Design Discussion",
      "Retrospective"
    ],
    "viz_labels_zh": [
      "驾驶员",
      "导航员",
      "轮换计时",
      "设计讨论",
      "回顾"
    ],
    "related": [
      "team-topologies",
      "blameless-postmortems",
      "spotify-model",
      "engineering-ladder"
    ],
    "tags": [
      "collaboration",
      "pair-programming",
      "mob-programming",
      "ensemble",
      "knowledge-sharing"
    ],
    "origin_author": "Woody Zuill, ~2012",
    "origin_source": "Mob Programming: A Whole Team Approach (Woody Zuill, 2014 Agile conference); informed by The Mythical Man-Month (Brooks, 1975) communication models",
    "origin_source_zh": "《群体编程：整个团队的方法》（Woody Zuill，2014 敏捷大会）；受《人月神话》（Brooks，1975）的沟通模型启发",
    "complexity": "beginner",
    "when_to_use": [
      "Complex, ambiguous problems where diverse perspectives significantly improve the solution quality",
      "Onboarding new team members who need to absorb team practices, codebase knowledge, and domain context quickly",
      "Critical path work where defects would be very costly and real-time review provides higher quality than async code review",
      "Teams with knowledge silos where expertise is concentrated in one or two people and needs to be distributed"
    ],
    "when_to_use_zh": [
      "复杂模糊的问题，多样化视角能显著提高解决方案质量",
      "新成员入职，需要快速吸收团队实践、代码库知识和领域上下文",
      "关键路径工作，缺陷成本极高，实时评审比异步代码评审提供更高质量",
      "存在知识孤岛的团队，专业知识集中在一两个人身上需要分散"
    ],
    "core_concepts": [
      "Driver-Navigator pattern: The Driver operates the keyboard but does not make design decisions — Navigators provide the thinking while the Driver translates it to code",
      "Strong-style pairing: 'For an idea to go from your head into the computer, it must go through someone else's hands' — Llewellyn Falco's principle that ensures knowledge transfer",
      "Whole-team ownership: Because everyone participates in writing every line of code, there is no individual code ownership and no knowledge silos",
      "Continuous code review: With multiple Navigators watching every keystroke, defects are caught in real-time rather than in post-hoc reviews",
      "Brooks's communication insight: Mob programming inverts Brooks's Law — instead of n*(n-1)/2 communication channels creating overhead, the single shared context eliminates miscommunication"
    ],
    "core_concepts_zh": [
      "驾驶员-领航员模式：驾驶员操作键盘但不做设计决策——领航员提供思路，驾驶员将其转化为代码",
      "强风格结对：「一个想法要从你的脑子进入电脑，必须经过另一个人的手」——Llewellyn Falco 的原则确保知识传递",
      "全团队所有权：因为每个人都参与编写每一行代码，不存在个人代码所有权和知识孤岛",
      "持续代码评审：多位领航员观察每个按键，缺陷在实时中被捕获，而非事后评审",
      "布鲁克斯的沟通洞察：群体编程反转了布鲁克斯定律——n*(n-1)/2 条通信通道带来的开销被单一共享上下文所消除，杜绝了沟通不畅"
    ],
    "timeline": [
      [
        "1975",
        "Fred Brooks identifies communication overhead as the dominant cost of adding people to a team in The Mythical Man-Month"
      ],
      [
        "1999",
        "Kent Beck formalizes pair programming as a core practice of Extreme Programming"
      ],
      [
        "2012",
        "Woody Zuill discovers mob programming at Hunter Industries when his team naturally starts working together at one screen"
      ],
      [
        "2014",
        "Zuill presents 'Mob Programming: A Whole Team Approach' at the Agile conference, introducing the practice to the wider community"
      ],
      [
        "2020",
        "Remote mob/ensemble programming surges during COVID-19, enabled by tools like VS Code Live Share, Tuple, and mob.sh"
      ]
    ],
    "timeline_zh": [
      [
        "1975",
        "Fred Brooks 在《人月神话》中将沟通开销确定为向团队添加人员的主要成本"
      ],
      [
        "1999",
        "Kent Beck 将结对编程正式确立为极限编程的核心实践"
      ],
      [
        "2012",
        "Woody Zuill 在 Hunter Industries 发现群体编程，当时他的团队自然而然地开始在一个屏幕前一起工作"
      ],
      [
        "2014",
        "Zuill 在敏捷大会上发表《群体编程：整个团队的方法》，将该实践介绍给更广泛的社区"
      ],
      [
        "2020",
        "远程群体/集合编程在 COVID-19 期间激增，得益于 VS Code Live Share、Tuple 和 mob.sh 等工具"
      ]
    ],
    "dos": [
      "Keep rotation intervals short (10-15 minutes) to maintain energy and prevent the Driver from zoning out",
      "Start with well-defined, time-boxed sessions (2-3 hours) rather than all-day mob programming to avoid fatigue",
      "Use mob programming selectively for high-value work: complex design, critical bugs, or knowledge transfer — not routine tasks",
      "Create psychological safety so junior team members feel comfortable navigating and voicing ideas to senior colleagues"
    ],
    "dos_zh": [
      "保持短轮换间隔（10-15 分钟）以维持精力，防止驾驶员走神",
      "从定义明确的限时会议（2-3 小时）开始，而非全天群体编程，以避免疲劳",
      "选择性地将群体编程用于高价值工作：复杂设计、关键 Bug 或知识传递——而非日常任务",
      "营造心理安全感，使初级团队成员在面对资深同事时也能自如地领航和表达想法"
    ],
    "donts": [
      "Don't use mob programming for simple, well-understood tasks where one person would be equally effective — it is wasteful for routine work",
      "Don't let one dominant personality monopolize navigation — the facilitator must ensure all voices are heard",
      "Don't skip the Driver rotation — without rotation, mob programming degrades into one person coding while others watch passively",
      "Don't force mob programming on unwilling teams — the practice requires buy-in and psychological safety to be effective"
    ],
    "donts_zh": [
      "不要将群体编程用于一个人即可同样有效完成的简单、明确的任务——对日常工作来说是浪费",
      "不要让一个强势的人垄断领航——引导者必须确保所有声音都被听到",
      "不要跳过驾驶员轮换——没有轮换，群体编程会退化为一个人编码而其他人被动旁观",
      "不要强迫不愿意的团队进行群体编程——该实践需要认同和心理安全感才能有效"
    ],
    "case_study_company": "Hunter Industries",
    "case_study": "Hunter Industries, a mid-sized irrigation equipment manufacturer in San Marcos, California, is where mob programming originated. In 2012, Woody Zuill's development team of 7 people began working together at a single workstation after a 'code retreat' exercise. Rather than returning to individual work, they continued the practice full-time. Over the next two years, the team reported a dramatic reduction in defects (virtually zero production bugs), elimination of code review bottlenecks, and a 10x improvement in onboarding speed for new developers. Brooks's predicted communication overhead paradoxically decreased because the shared context eliminated the need for status meetings, handoff documentation, and async coordination. The practice spread industry-wide after Zuill's 2014 Agile conference presentation.",
    "case_study_zh": "Hunter Industries 是位于加州圣马科斯的中型灌溉设备制造商，是群体编程的发源地。2012 年，Woody Zuill 的 7 人开发团队在一次「代码静修」练习后开始在一个工作站上一起工作。他们没有回到各自独立工作，而是继续全职实践。在接下来的两年中，团队报告缺陷大幅减少（生产 Bug 几乎为零）、消除了代码评审瓶颈，新开发者入职速度提高了 10 倍。布鲁克斯预测的沟通开销反而降低了，因为共享上下文消除了对状态会议、交接文档和异步协调的需要。在 Zuill 2014 年敏捷大会的演讲之后，该实践在业界广泛传播。",
    "when_not_to_use": [
      "Routine, well-understood tasks where individual work is more efficient and mob programming adds overhead without quality benefit",
      "Teams larger than 6 people — beyond this size, navigators cannot all contribute meaningfully and engagement drops",
      "Remote teams without reliable, low-latency screen-sharing tools — network lag makes the Driver experience frustrating",
      "Highly independent tasks that do not benefit from real-time collaboration (e.g., documentation, data entry, configuration)"
    ],
    "when_not_to_use_zh": [
      "日常的、明确的任务，个人工作更高效，群体编程增加开销但无质量收益",
      "超过 6 人的团队——超过此规模，领航员无法都有意义地贡献，参与度下降",
      "没有可靠低延迟屏幕共享工具的远程团队——网络延迟使驾驶员体验令人沮丧",
      "不受益于实时协作的高度独立任务（如文档编写、数据录入、配置）"
    ],
    "adopters": [
      "Hunter Industries",
      "Cucumber",
      "Shopify",
      "Unruly Media",
      "LEGO"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Zuill, W. & Meadows, K. (2016). \"Mob Programming: A Whole Team Approach\". woody-zuill.com.",
    "secondary_sources": [
      "Zuill, W. (2014). \"Mob Programming — A Whole Team Approach\". Proceedings of Agile 2014.",
      "Wilson, E. (2023). \"Ensemble Programming Guidebook\". ensembleprogramming.xyz."
    ],
    "typed_relations": [
      {
        "slug": "team-topologies",
        "type": "complement"
      },
      {
        "slug": "blameless-postmortems",
        "type": "complement"
      },
      {
        "slug": "spotify-model",
        "type": "complement"
      },
      {
        "slug": "engineering-ladder",
        "type": "complement"
      }
    ]
  },
  {
    "id": 215,
    "name": "RFC Process",
    "name_zh": "RFC流程",
    "slug": "rfc-process",
    "category": "team",
    "desc": "Lightweight request-for-comments process for transparent technical decision-making",
    "desc_zh": "用于透明技术决策的轻量级意见征集流程",
    "steps": [
      "Author drafts an RFC document using a standard template (motivation, proposed solution, alternatives considered, drawbacks, unresolved questions) and opens it as a pull request or shared document",
      "The RFC enters a comment period (typically 1-4 weeks) during which any stakeholder can read, comment, ask questions, or propose amendments to the draft",
      "The author revises the RFC based on feedback, addressing objections and updating the alternatives and unresolved questions sections",
      "A designated shepherd (tech lead, architect, or committee) evaluates the feedback and makes a final decision: accept, reject, or send back for revision with specific changes required",
      "Accepted RFCs are merged into the RFC repository and serve as the authoritative record of the decision; rejected RFCs are closed with a documented rationale"
    ],
    "steps_zh": [
      "作者使用标准模板（动机、提议的解决方案、已考虑的替代方案、缺点、未解决的问题）起草RFC文档，并作为拉取请求或共享文档打开",
      "RFC进入评论期（通常为1-4周），在此期间任何利益相关方都可以阅读、评论、提问或提议修改草案",
      "作者根据反馈修改RFC，解决异议并更新替代方案和未解决问题部分",
      "指定的推进者（技术负责人、架构师或委员会）评估反馈并做出最终决定：接受、拒绝或要求特定修改后重新提交",
      "被接受的RFC合并到RFC仓库，作为决策的权威记录；被拒绝的RFC以记录的理由关闭"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "RFC Draft",
      "Comment Period",
      "Author Revision",
      "Shepherd Decision",
      "Merge Archive"
    ],
    "viz_labels_zh": [
      "RFC草稿",
      "评论期",
      "作者修订",
      "决策人",
      "归档"
    ],
    "related": [
      "architecture-review-board",
      "engineering-ladder"
    ],
    "tags": [
      "rfc",
      "decision-making",
      "process",
      "governance",
      "documentation"
    ],
    "origin_author": "Popularized by Rust language team (2014) and React team (2017); inspired by IETF RFC process (1969)",
    "origin_source": "Rust RFC Book (github.com/rust-lang/rfcs); React RFC process (github.com/reactjs/rfcs); Kleppmann, M. (2017). \"Designing Data-Intensive Applications\", Ch. 1.",
    "origin_source_zh": "Rust RFC手册（github.com/rust-lang/rfcs）；React RFC流程（github.com/reactjs/rfcs）；Kleppmann, M.（2017）「数据密集型应用系统设计」，第1章",
    "complexity": "beginner",
    "when_to_use": [
      "When a technical decision will affect multiple teams or cannot be easily reversed once implemented",
      "When the team needs a structured way to gather diverse perspectives before committing to an architectural direction",
      "When junior engineers need a safe, asynchronous way to propose improvements without requiring real-time advocacy skills",
      "When the organization has had recurring problems with decisions made in small groups without broader input leading to downstream rework"
    ],
    "when_to_use_zh": [
      "当技术决策将影响多个团队或一旦实施就难以逆转时",
      "当团队需要一种结构化方式在承诺某个架构方向之前收集多样化观点时",
      "当初级工程师需要一种安全、异步的方式来提议改进，而不需要实时倡导技能时",
      "当组织在没有更广泛输入的小组决策导致后期返工方面有反复问题时"
    ],
    "core_concepts": [
      "Asynchronous Decision-Making: RFC enables thoughtful deliberation across time zones and schedules without requiring synchronous meetings for every technical choice",
      "Written Culture: Decisions are made through written argument rather than verbal persuasion, which benefits introverts, non-native speakers, and remote team members equally",
      "Decision Record: The final RFC serves as a durable, searchable record of why a decision was made, preventing the 'why did we do it this way?' problem months later",
      "Shepherd Model: A designated individual or small committee ensures the RFC reaches a decision rather than dying in an infinite comment cycle",
      "Alternatives Considered: The RFC format requires documenting rejected alternatives with reasons, preventing the same rejected ideas from being re-proposed without new information"
    ],
    "core_concepts_zh": [
      "异步决策制定：RFC使得跨时区和日程的深思熟虑审议成为可能，而不需要为每个技术选择举行同步会议",
      "书面文化：决策通过书面论证而非口头说服做出，这对内向者、非母语使用者和远程团队成员同等有益",
      "决策记录：最终的RFC作为持久、可搜索的记录，说明为什么做出某个决定，防止数月后出现「为什么我们这样做？」的问题",
      "推进者模式：指定的个人或小型委员会确保RFC达成决定，而不是在无限评论循环中消亡",
      "已考虑的替代方案：RFC格式要求记录被拒绝的替代方案及原因，防止相同的被拒绝想法在没有新信息的情况下被重新提议"
    ],
    "timeline": [
      [
        "1969",
        "IETF publishes its first Request for Comments (RFC 1 by Steve Crocker), establishing the collaborative technical discussion format"
      ],
      [
        "2014",
        "Rust language team adopts a formal RFC process for all language changes, setting the template for engineering team RFCs"
      ],
      [
        "2017",
        "React team open-sources its RFC process for major API changes, popularizing lightweight RFCs in product engineering teams"
      ],
      [
        "2019",
        "Basecamp, GitHub, and Shopify publish their internal RFC/ADR templates, spurring adoption across the industry"
      ],
      [
        "2022",
        "RFC tools (Linear, Notion, Confluence RFC templates) make structured decision-making accessible to non-developer teams"
      ]
    ],
    "timeline_zh": [
      [
        "1969",
        "IETF发布其第一个意见征集文档（Steve Crocker的RFC 1），建立了协作技术讨论格式"
      ],
      [
        "2014",
        "Rust语言团队为所有语言变更采用正式的RFC流程，为工程团队RFC设立了模板"
      ],
      [
        "2017",
        "React团队开源其重大API变更的RFC流程，使轻量级RFC在产品工程团队中流行"
      ],
      [
        "2019",
        "Basecamp、GitHub和Shopify发布其内部RFC/ADR模板，推动了整个行业的采用"
      ],
      [
        "2022",
        "RFC工具（Linear、Notion、Confluence RFC模板）使非开发者团队也能进行结构化决策"
      ]
    ],
    "dos": [
      "Do keep RFC documents short (1-3 pages) because long documents are not read; extract detail into appendices or linked documents",
      "Do set a clear comment deadline so that stakeholders know when to engage and the RFC doesn't linger in review indefinitely",
      "Do require authors to document rejected alternatives with specific reasons because it prevents re-hashing of already-explored options in future discussions",
      "Do store merged RFCs in a searchable repository (Git, Confluence, Notion) so that future engineers can understand the reasoning behind current decisions"
    ],
    "dos_zh": [
      "保持RFC文档简短（1-3页），因为长文档不会被阅读；将细节提取到附录或链接文档中",
      "设置明确的评论截止日期，使利益相关方知道何时参与，RFC不会无限期停留在评审中",
      "要求作者记录被拒绝的替代方案及具体原因，防止在未来讨论中重复已探索过的选项",
      "将合并的RFC存储在可搜索的仓库（Git、Confluence、Notion）中，使未来的工程师能够理解当前决策背后的原因"
    ],
    "donts": [
      "Don't require RFCs for every decision because the overhead will cause engineers to bypass the process for non-trivial choices to avoid bureaucracy",
      "Don't let RFCs become approval theater where the decision is made before the RFC is written and comments are ignored",
      "Don't allow RFC comment periods to drag on indefinitely without a shepherd closing the discussion because stalled RFCs create organizational paralysis",
      "Don't make RFC authorship gatekept to senior engineers because it defeats the purpose of creating a democratic, inclusive decision process"
    ],
    "donts_zh": [
      "不要对每个决策都要求RFC，因为开销会导致工程师为了避免官僚程序而对非平凡的选择绕过该流程",
      "不要让RFC成为审批表演，即在编写RFC之前就已做出决定，评论被忽视",
      "不要让RFC评论期在没有推进者结束讨论的情况下无限期拖延，因为停滞的RFC会造成组织瘫痪",
      "不要将RFC作者资格限制给高级工程师，因为这违背了创建民主、包容决策流程的目的"
    ],
    "case_study_company": "Rust Language Team",
    "case_study": "The Rust programming language team has used an RFC process since 2014 for all significant language, library, and tooling changes. Every feature in Rust — from async/await to the ownership model refinements — was first proposed, debated, and approved through a public RFC on GitHub. The process has enabled a globally distributed community of contributors to make high-quality decisions without centralized authority. As of 2024, the Rust RFC repository contains over 3,000 RFCs spanning 10 years of language evolution, serving as an unparalleled historical record of why Rust is designed the way it is.",
    "case_study_zh": "Rust编程语言团队自2014年以来对所有重要的语言、库和工具变更使用RFC流程。Rust中的每个特性——从async/await到所有权模型改进——都首先通过GitHub上的公开RFC提议、讨论和批准。该流程使全球分布式贡献者社区能够在没有集中权威的情况下做出高质量决策。截至2024年，Rust RFC仓库包含超过3000个RFC，跨越10年的语言演进，作为Rust为何如此设计的无与伦比的历史记录。",
    "when_not_to_use": [
      "Urgent production incidents where decisions must be made in minutes, not weeks, and the RFC process creates dangerous delay",
      "Small decisions (library patch version bumps, variable naming, minor refactors) where the RFC overhead exceeds the decision complexity",
      "Exploration and research phases where the goal is to learn, not commit, and forcing premature decisions stifles experimentation",
      "Organizations with fewer than 5 engineers where synchronous conversation is more efficient than asynchronous written deliberation"
    ],
    "when_not_to_use_zh": [
      "紧急生产事故，决策必须在分钟而非数周内做出，RFC流程造成危险延迟",
      "小型决策（库补丁版本升级、变量命名、小型重构），RFC开销超过决策复杂度",
      "探索和研究阶段，目标是学习而非承诺，强制提前决策会扼制实验",
      "少于5名工程师的组织，同步对话比异步书面审议更高效"
    ],
    "adopters": [
      "Rust Foundation",
      "React / Meta",
      "Ember.js",
      "Swift (Apple)",
      "Shopify"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Crocker, S. (1969). \"RFC 1: Host Software\". IETF. / Rust Team (2014). \"RFC 0002: RFC Process\". github.com/rust-lang/rfcs.",
    "secondary_sources": [
      "React Team (2017). \"React RFC Process\". github.com/reactjs/rfcs.",
      "Nygard, M. (2011). \"Documenting Architecture Decisions\". cognitect.com.",
      "Kleppmann, M. (2017). \"Designing Data-Intensive Applications\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "architecture-review-board",
        "type": "complement"
      },
      {
        "slug": "engineering-ladder",
        "type": "complement"
      }
    ]
  },
  {
    "id": 216,
    "name": "Guilds and Communities of Practice",
    "name_zh": "行会与实践社区",
    "slug": "guilds-communities-of-practice",
    "category": "team",
    "desc": "Cross-team knowledge-sharing groups that build expertise and standards across organizational silos",
    "desc_zh": "跨团队知识共享群体，跨组织孤岛构建专业知识和标准",
    "steps": [
      "Identify a technical or practice domain (frontend, security, data engineering, accessibility) that spans multiple teams and would benefit from cross-team coordination",
      "Appoint a guild lead (voluntary or designated) who owns the meeting cadence, agenda, and communication channels; this role rotates periodically to prevent single-point-of-failure",
      "Establish recurring touchpoints: a monthly meeting for knowledge sharing (demos, post-mortems, book clubs) and an async channel (Slack, Discord) for daily questions and resource sharing",
      "Define the guild's output artifacts: shared coding standards, reviewed libraries, onboarding guides, and recommended tooling that teams adopt voluntarily",
      "Measure the guild's health annually through member satisfaction surveys and track adoption of guild-produced standards to demonstrate value to leadership"
    ],
    "steps_zh": [
      "识别跨多个团队的技术或实践领域（前端、安全、数据工程、可访问性），跨团队协调将带来价值",
      "任命行会负责人（自愿或指定），负责会议节奏、议程和沟通渠道；该角色定期轮换以防止单点故障",
      "建立定期接触点：每月知识共享会议（演示、事后分析、读书会）和异步频道（Slack、Discord）用于日常问题和资源分享",
      "定义行会的输出产物：共享编码标准、审查的库、入职指南和推荐工具，团队自愿采用",
      "通过成员满意度调查每年评估行会健康状况，并跟踪行会制定标准的采用情况以向领导层证明价值"
    ],
    "ai_relevant": false,
    "viz_type": "cycle",
    "viz_labels": [
      "Domain Identify",
      "Guild Lead",
      "Recurring Touchpoints",
      "Output Artifacts",
      "Health Metrics"
    ],
    "viz_labels_zh": [
      "领域识别",
      "组长",
      "定期活动",
      "输出产物",
      "健康指标"
    ],
    "related": [
      "spotify-model",
      "platform-engineering"
    ],
    "tags": [
      "guild",
      "community-of-practice",
      "knowledge-sharing",
      "cross-team",
      "standards"
    ],
    "origin_author": "Etienne Wenger, Communities of Practice (1998); popularized in software by the Spotify Model (2012)",
    "origin_source": "Wenger, E. (1998). \"Communities of Practice: Learning, Meaning, and Identity\". Cambridge University Press; Kniberg, H. & Ivarsson, A. (2012). \"Scaling Agile @ Spotify\".",
    "origin_source_zh": "Wenger, E.（1998）「实践社区：学习、意义与认同」，剑桥大学出版社；Kniberg, H. & Ivarsson, A.（2012）「Spotify的规模化敏捷」",
    "complexity": "beginner",
    "when_to_use": [
      "When the organization has grown beyond 3-4 teams and domain expertise is siloed within individual squads rather than shared across the organization",
      "When the same technical problems (security vulnerabilities, performance issues, accessibility failures) are solved independently by multiple teams with inconsistent outcomes",
      "When senior engineers are embedded in delivery teams with no mechanism to share their expertise at the organizational level",
      "When the organization is adopting new technology (AI, observability, platform engineering) and needs a coordinated learning and adoption path across teams"
    ],
    "when_to_use_zh": [
      "当组织已超过3-4个团队，领域专业知识被孤立在各个小组内而非跨组织共享时",
      "当相同的技术问题（安全漏洞、性能问题、可访问性失败）被多个团队独立解决，结果不一致时",
      "当高级工程师嵌入交付团队，没有在组织层面分享专业知识的机制时",
      "当组织正在采用新技术（AI、可观测性、平台工程）并需要跨团队协调学习和采用路径时"
    ],
    "core_concepts": [
      "Domain: The shared area of interest or expertise that defines the guild's scope — narrow enough to be coherent but broad enough to span multiple teams",
      "Community: The group of practitioners who care about the domain and voluntarily participate in the guild's activities, regardless of their squad or reporting line",
      "Practice: The shared repertoire of tools, standards, patterns, and processes that the guild develops and curates over time",
      "Voluntary Participation: Guilds are opt-in; mandatory guilds become bureaucratic committees that produce compliance theater rather than genuine knowledge sharing",
      "Influence Without Authority: Guild leads must influence team behavior through demonstrated expertise and useful artifacts rather than through organizational hierarchy"
    ],
    "core_concepts_zh": [
      "领域：定义行会范围的共同兴趣或专业知识领域——范围足够窄以保持连贯性，但足够宽以跨越多个团队",
      "社区：关心该领域并自愿参与行会活动的实践者群体，无论其所在小组或汇报关系",
      "实践：行会随时间开发和策划的共享工具、标准、模式和流程库",
      "自愿参与：行会是可选的；强制性行会变成产生合规表演而非真正知识共享的官僚委员会",
      "无权力的影响力：行会负责人必须通过展示的专业知识和有用的产物而非通过组织层级来影响团队行为"
    ],
    "timeline": [
      [
        "1991",
        "Jean Lave and Etienne Wenger coin 'communities of practice' studying apprenticeship learning in craft trades"
      ],
      [
        "1998",
        "Wenger's book 'Communities of Practice' formalizes the theory for organizational knowledge management"
      ],
      [
        "2012",
        "Spotify's engineering model paper introduces 'guilds' as a specific implementation for software engineering organizations"
      ],
      [
        "2016",
        "Major tech companies (Google, Amazon, Zalando, ING) adopt guilds as a formal organizational layer alongside squads and chapters"
      ],
      [
        "2022",
        "Remote-first guilds emerge as a primary mechanism for maintaining cross-team connection and knowledge sharing in distributed organizations"
      ]
    ],
    "timeline_zh": [
      [
        "1991",
        "Jean Lave和Etienne Wenger在研究手工艺学徒制学习时创造了「实践社区」一词"
      ],
      [
        "1998",
        "Wenger的「实践社区」一书为组织知识管理正式化了该理论"
      ],
      [
        "2012",
        "Spotify工程模型论文引入「行会」作为软件工程组织的具体实现"
      ],
      [
        "2016",
        "主要科技公司（Google、Amazon、Zalando、ING）将行会作为小组和章节之外的正式组织层次采用"
      ],
      [
        "2022",
        "远程优先行会成为在分布式组织中维持跨团队联系和知识共享的主要机制"
      ]
    ],
    "dos": [
      "Do keep guild membership voluntary because mandatory guilds create attendance compliance rather than genuine engagement",
      "Do produce concrete, usable artifacts (linting configs, Docker base images, onboarding guides) because abstract knowledge sharing without deliverables doesn't change daily practice",
      "Do rotate guild leads every 12-18 months because static leadership creates knowledge monopolies and prevents the next generation of experts from developing",
      "Do give guilds a dedicated time allocation (e.g., 10% of sprint capacity) so that participation doesn't exclusively happen as unpaid overtime"
    ],
    "dos_zh": [
      "保持行会成员资格自愿，因为强制性行会产生出勤合规而非真正的参与",
      "产出具体、可用的产物（代码检查配置、Docker基础镜像、入职指南），因为没有可交付成果的抽象知识共享不会改变日常实践",
      "每12-18个月轮换行会负责人，因为静态领导层创造知识垄断并阻止下一代专家的发展",
      "给行会分配专用时间（如迭代容量的10%），使参与不仅仅作为无偿加班发生"
    ],
    "donts": [
      "Don't let guilds become toothless talking shops that meet monthly, produce slide decks, and have no impact on how teams actually work",
      "Don't create guilds for every topic because guild proliferation causes context-switching fatigue and dilutes participation across all guilds",
      "Don't measure guild success by attendance alone because a 50-person guild with zero adopted standards is less valuable than a 10-person guild whose patterns are used everywhere",
      "Don't conflate guilds with chapters — chapters own career development and competency within a discipline, guilds focus on cross-team knowledge sharing"
    ],
    "donts_zh": [
      "不要让行会成为每月开会、生产幻灯片、对团队实际工作方式没有影响的无实质内容的讨论场所",
      "不要为每个主题创建行会，因为行会泛滥会导致上下文切换疲劳并稀释所有行会的参与度",
      "不要仅通过出勤率来衡量行会成功，因为一个有50人但标准采用率为零的行会不如一个有10人但其模式被广泛使用的行会有价值",
      "不要将行会与章节混淆——章节负责某学科内的职业发展和能力，行会专注于跨团队知识共享"
    ],
    "case_study_company": "ING Bank",
    "case_study": "ING Bank adopted the Spotify model in 2015, organizing its 13,000-person IT department into squads, tribes, chapters, and guilds. The bank's Security Guild became a flagship example: a 200-engineer voluntary community that produced shared security testing frameworks, threat modeling templates, and a weekly vulnerability digest. Within 18 months of the guild's formation, security issue detection rates improved 40% and remediation time dropped from 45 days to 11 days, attributed directly to the shared knowledge and tools the guild produced. ING later expanded the model to non-engineering functions including data science, UX, and product management.",
    "case_study_zh": "ING银行于2015年采用Spotify模型，将其13000人的IT部门组织成小组、部落、章节和行会。该银行的安全行会成为旗舰示例：一个由200名工程师组成的自愿社区，生产了共享安全测试框架、威胁建模模板和每周漏洞摘要。在行会成立后的18个月内，安全问题检测率提高了40%，修复时间从45天降至11天，这直接归功于行会产生的共享知识和工具。ING后来将该模型扩展到非工程职能，包括数据科学、UX和产品管理。",
    "when_not_to_use": [
      "Organizations of fewer than 20-30 engineers where everyone already knows each other and cross-team knowledge sharing happens organically",
      "High-urgency delivery phases (pre-launch sprints, critical incident recovery) where all capacity must focus on delivery and community activities must pause",
      "Organizations with highly contractual employment relationships where voluntary after-hours participation is inappropriate or legally ambiguous",
      "Domains where knowledge sharing requires security clearances or confidentiality that prevents open cross-team discussion"
    ],
    "when_not_to_use_zh": [
      "少于20-30名工程师的组织，每个人已经认识对方，跨团队知识共享有机发生",
      "高紧迫度交付阶段（发布前冲刺、关键事故恢复），所有容量必须专注于交付，社区活动必须暂停",
      "具有高度契约性雇佣关系的组织，自愿的非工时参与是不恰当的或法律上模糊的",
      "知识共享需要安全许可或保密性的领域，这阻止了开放的跨团队讨论"
    ],
    "adopters": [
      "Spotify",
      "ING Bank",
      "Zalando",
      "Adidas",
      "Google"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Wenger, E. (1998). \"Communities of Practice: Learning, Meaning, and Identity\". Cambridge University Press.",
    "secondary_sources": [
      "Kniberg, H. & Ivarsson, A. (2012). \"Scaling Agile @ Spotify\". Spotify R&D whitepaper.",
      "Wenger, E., McDermott, R. & Snyder, W.M. (2002). \"Cultivating Communities of Practice\". Harvard Business School Press.",
      "Skelton, M. & Pais, M. (2019). \"Team Topologies\", Ch. 7. IT Revolution Press."
    ],
    "typed_relations": [
      {
        "slug": "spotify-model",
        "type": "extends"
      },
      {
        "slug": "platform-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 217,
    "name": "On-Call Rotation Design",
    "name_zh": "值班轮换设计",
    "slug": "on-call-rotation-design",
    "category": "team",
    "desc": "Fair, sustainable on-call schedules and escalation policies that protect engineer wellbeing",
    "desc_zh": "保护工程师福祉的公平、可持续值班计划和升级策略",
    "steps": [
      "Define the service ownership scope for the on-call rotation: which services, alerts, and runbooks belong to the on-call engineer and what constitutes a page-worthy incident",
      "Design the rotation schedule: determine the rotation length (weekly is most common), the number of engineers in the pool (minimum 4-6 to allow reasonable frequency), and whether a secondary on-call provides backup",
      "Set escalation policies: define how long the primary on-call has to acknowledge an alert before it escalates to the secondary, and from secondary to team lead, with explicit timeouts",
      "Establish on-call compensation and working norms: time-off in lieu (TOIL) for disturbed nights, a hard limit on night pages per on-call shift, and a mandatory rest period after a severe incident",
      "Run a quarterly on-call retrospective reviewing alert volume, false positive rate, mean time to acknowledge (MTTA), and engineer satisfaction to drive continuous improvement"
    ],
    "steps_zh": [
      "定义值班轮换的服务所有权范围：哪些服务、告警和运行手册属于值班工程师，什么构成值得呼叫的事件",
      "设计轮换计划：确定轮换长度（每周最为常见）、轮换池中的工程师数量（至少4-6人以保证合理频率），以及是否由二线值班提供备份",
      "设定升级策略：定义一线值班工程师在告警升级到二线之前有多长时间确认，以及从二线升级到团队负责人的时间，包含明确的超时时间",
      "建立值班补偿和工作规范：因被打扰的夜晚给予调休（TOIL），每个值班班次的夜间呼叫硬性上限，以及严重事故后的强制休息期",
      "运行季度值班回顾，审查告警量、误报率、平均确认时间（MTTA）和工程师满意度，以推动持续改进"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Service Scope",
      "Rotation Schedule",
      "Escalation Policy",
      "Compensation Norms",
      "Retrospective"
    ],
    "viz_labels_zh": [
      "服务范围",
      "轮值计划",
      "升级策略",
      "补偿规范",
      "定期回顾"
    ],
    "related": [
      "blameless-postmortems",
      "runbook-automation",
      "engineering-metrics-dashboard"
    ],
    "tags": [
      "on-call",
      "rotation",
      "incident-response",
      "escalation",
      "sustainability"
    ],
    "origin_author": "SRE practices (Google, 2003); PagerDuty escalation model (2010); Charity Majors and Alice Goldfuss advocacy for sustainable on-call",
    "origin_source": "Beyer, B. et al. (2016). \"Site Reliability Engineering\", Ch. 11. Google / O'Reilly; Majors, C. (2019). \"On-Call Shouldn't Suck\". honeycomb.io blog.",
    "origin_source_zh": "Beyer, B.等（2016）「网站可靠性工程」，第11章，Google/O'Reilly；Majors, C.（2019）「值班不应该很糟糕」，honeycomb.io博客",
    "complexity": "intermediate",
    "when_to_use": [
      "When a service has production SLOs that require human intervention within minutes and must be monitored 24/7",
      "When the team is growing past 6-8 engineers and informal 'ping the person who built it' escalation is creating burnout for individual contributors",
      "When incident response times are inconsistent because there is no defined first-responder and anyone might respond (or no one does)",
      "When on-call burn rate is causing attrition and engineers cite on-call burden as a primary reason for leaving the organization"
    ],
    "when_to_use_zh": [
      "当服务有需要几分钟内人工干预且必须24/7监控的生产SLO时",
      "当团队超过6-8名工程师，非正式的「联系构建它的人」升级方式导致个人贡献者倦怠时",
      "当事故响应时间不一致，因为没有定义的第一响应者，任何人都可能响应（或没有人响应）时",
      "当值班消耗率导致人员流失，工程师将值班负担列为离开组织的主要原因时"
    ],
    "core_concepts": [
      "Primary and Secondary On-Call: A two-tier model where primary handles the initial alert and secondary acts as backup if primary doesn't acknowledge within the timeout period",
      "Escalation Policy: A defined sequence of contacts and timeouts that ensure an alert is never silently dropped — it always reaches a human who can act on it",
      "MTTA and MTTR: Mean Time to Acknowledge (MTTA) measures responsiveness; Mean Time to Resolve (MTTR) measures effectiveness; both are tracked per on-call rotation",
      "Alert Fatigue: The cumulative effect of too many low-signal alerts that trains responders to ignore pages, leading to missed critical incidents",
      "Sustainable On-Call: The principle that on-call should consume less than 25% of an engineer's working time (SRE guideline) to allow focus on improvement work"
    ],
    "core_concepts_zh": [
      "一线和二线值班：两级模型，一线处理初始告警，二线在一线未在超时期限内确认时作为备份",
      "升级策略：定义的联系人序列和超时，确保告警永远不会被静默丢弃——它总会到达能够采取行动的人",
      "MTTA和MTTR：平均确认时间（MTTA）衡量响应能力；平均解决时间（MTTR）衡量有效性；两者都按值班轮换跟踪",
      "告警疲劳：过多低信号告警的累积效应，训练响应者忽略呼叫，导致错过关键事故",
      "可持续值班：值班消耗应少于工程师工作时间25%的原则（SRE指导方针），以允许专注于改进工作"
    ],
    "timeline": [
      [
        "2003",
        "Google SRE team formalizes on-call rotation practices including the 50% toil limit and escalation policies for production services"
      ],
      [
        "2010",
        "PagerDuty founded, providing the first SaaS platform specifically designed for on-call scheduling and alert routing"
      ],
      [
        "2016",
        "Google's SRE book (O'Reilly) publishes on-call practices as industry standard, including the rule that on-call burden must not exceed 25% of time"
      ],
      [
        "2019",
        "Charity Majors (co-author of 'Database Reliability Engineering', 2017) and Alice Goldfuss write and speak extensively on sustainable on-call culture"
      ],
      [
        "2022",
        "Incident.io, Rootly, and Opsgenie advance on-call tooling with AI-powered alert correlation and automatic runbook suggestions"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Google SRE团队正式化值班轮换实践，包括50%琐事上限和生产服务的升级策略"
      ],
      [
        "2010",
        "PagerDuty成立，提供第一个专门为值班排班和告警路由设计的SaaS平台"
      ],
      [
        "2016",
        "Google的SRE书（O'Reilly）将值班实践作为行业标准发布，包括值班负担不得超过25%时间的规则"
      ],
      [
        "2019",
        "Charity Majors（2017年出版的「数据库可靠性工程」合著者）和Alice Goldfuss广泛撰文和演讲，倡导可持续的值班文化"
      ],
      [
        "2022",
        "Incident.io、Rootly和Opsgenie通过AI驱动的告警关联和自动运行手册建议推进值班工具"
      ]
    ],
    "dos": [
      "Do ensure the on-call rotation pool has at least 4-6 engineers so that no individual is on-call more than one week in four",
      "Do compensate engineers for on-call time through explicit pay, time-off in lieu, or reduced sprint commitments because unpaid on-call creates resentment and attrition",
      "Do conduct post-incident reviews for every severity-1 incident and track the actionable improvements through to completion to prevent recurring pages",
      "Do set a maximum number of night pages per on-call shift (e.g., 5 per week) as a hard organizational commitment, not a guideline"
    ],
    "dos_zh": [
      "确保值班轮换池至少有4-6名工程师，使任何个人的值班频率不超过每四周一次",
      "通过明确的薪酬、调休或减少迭代承诺来补偿工程师的值班时间，因为无偿值班会产生怨恨和人员流失",
      "对每个S1级事故进行事后审查，并跟踪可操作的改进直到完成，以防止重复告警",
      "将每个值班班次的夜间呼叫最大数量（如每周5次）设为硬性组织承诺，而非指导方针"
    ],
    "donts": [
      "Don't put engineers on-call for services they didn't build and haven't been properly trained on because it creates dangerous guesswork during incidents",
      "Don't treat on-call as a rite of passage or proof of seniority because it normalizes unsustainable practices and drives away excellent engineers who have alternatives",
      "Don't ignore alert volume as a metric because teams that normalize high alert volumes are training themselves for alert fatigue and missed critical incidents",
      "Don't allow on-call engineers to stay on a page without escalation indefinitely; set hard timeouts because delayed escalation extends incident duration"
    ],
    "donts_zh": [
      "不要让工程师值班他们没有构建且没有经过适当培训的服务，因为这在事故中产生危险的猜测",
      "不要将值班视为成人礼或资历证明，因为这使不可持续的实践正常化，并驱走有其他选择的优秀工程师",
      "不要忽视告警量作为指标，因为使高告警量正常化的团队正在训练自己产生告警疲劳并错过关键事故",
      "不要允许值班工程师无限期停留在呼叫上而不升级；设置硬性超时，因为延迟升级会延长事故持续时间"
    ],
    "case_study_company": "Google SRE",
    "case_study": "Google's Site Reliability Engineering team codified on-call best practices that became the industry standard. Their model limits on-call burden to 25% of an SRE's working time (including both the on-call shift and follow-up work), with the remainder dedicated to engineering improvements. Google's SRE book documented that when on-call load exceeded 25%, SRE teams experienced measurable increases in burnout, errors, and attrition. The escalation policy model — primary acknowledges within 5 minutes or secondary is paged, secondary within 10 minutes or management is called — has been adopted by PagerDuty, Opsgenie, and Incident.io as the default template for new customers.",
    "case_study_zh": "Google的网站可靠性工程团队编纂了成为行业标准的值班最佳实践。他们的模型将值班负担限制在SRE工作时间的25%（包括值班班次和后续工作），其余时间专注于工程改进。Google的SRE书记录了当值班负载超过25%时，SRE团队经历了可测量的倦怠、错误和人员流失增加。升级策略模型——一线在5分钟内确认，否则呼叫二线，二线在10分钟内确认，否则通知管理层——已被PagerDuty、Opsgenie和Incident.io采用为新客户的默认模板。",
    "when_not_to_use": [
      "Internal tooling or development services with no user-facing SLOs where an 8x5 support model with next-business-day response is acceptable",
      "Services in early development where stability is not yet expected and alerting produces more noise than signal",
      "Teams of fewer than 4 engineers where sustainable rotation is impossible and alternative architectures (using managed services to reduce operational burden) should be prioritized",
      "Non-critical internal services where the cost of a 24/7 on-call rotation exceeds the cost of occasional downtime"
    ],
    "when_not_to_use_zh": [
      "没有面向用户SLO的内部工具或开发服务，8x5支持模型和下一个工作日响应是可接受的",
      "早期开发中的服务，尚不期望稳定性，告警产生的噪音多于信号",
      "少于4名工程师的团队，可持续轮换是不可能的，应优先考虑替代架构（使用托管服务减少运营负担）",
      "非关键内部服务，24/7值班轮换的成本超过偶尔停机的成本"
    ],
    "adopters": [
      "Google",
      "Netflix",
      "Stripe",
      "Atlassian",
      "PagerDuty"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Beyer, B., Jones, C., Petoff, J. & Murphy, N.R. (2016). \"Site Reliability Engineering\", Ch. 11. O'Reilly Media.",
    "secondary_sources": [
      "Majors, C. (2019). \"On-Call Shouldn't Suck: A Guide for Managers\". honeycomb.io.",
      "PagerDuty (2023). \"Incident Response Operational Guide\". response.pagerduty.com.",
      "Blank-Edelman, D. (2018). \"Seeking SRE\", Ch. 18-19. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "blameless-postmortems",
        "type": "complement"
      },
      {
        "slug": "runbook-automation",
        "type": "complement"
      },
      {
        "slug": "engineering-metrics-dashboard",
        "type": "complement"
      }
    ]
  },
  {
    "id": 218,
    "name": "Engineering Metrics Dashboard",
    "name_zh": "工程指标仪表板",
    "slug": "engineering-metrics-dashboard",
    "category": "team",
    "desc": "DORA metrics, developer satisfaction, and quality metrics unified in the SPACE framework",
    "desc_zh": "在SPACE框架中统一的DORA指标、开发者满意度和质量指标",
    "steps": [
      "Select the metric framework: DORA (Deployment Frequency, Lead Time for Changes, Change Failure Rate, Time to Restore Service) plus SPACE dimensions (Satisfaction, Performance, Activity, Communication, Efficiency)",
      "Instrument the delivery pipeline to collect DORA metrics automatically: connect CI/CD, incident management, and version control systems to a metrics aggregation platform (LinearB, Jellyfish, Sleuth, or custom)",
      "Survey developers quarterly using validated satisfaction instruments (Developer Experience Index, DevEx survey) to capture the subjective SPACE dimensions that pipeline data cannot measure",
      "Build a dashboard visible to teams, managers, and leadership that shows trends over time (not just current snapshots) and highlights leading indicators like PR cycle time alongside lagging indicators like MTTR",
      "Hold a monthly metrics review with the team to interpret trends, set improvement targets for the next quarter, and connect metric improvements to engineering investments"
    ],
    "steps_zh": [
      "选择指标框架：DORA（部署频率、变更前置时间、变更失败率、服务恢复时间）加上SPACE维度（满意度、性能、活动、沟通、效率）",
      "对交付流水线进行仪器化以自动收集DORA指标：将CI/CD、事故管理和版本控制系统连接到指标聚合平台（LinearB、Jellyfish、Sleuth或自定义）",
      "使用经过验证的满意度工具（开发者体验指数、DevEx调查）每季度调查开发者，捕获流水线数据无法衡量的主观SPACE维度",
      "构建对团队、管理者和领导层可见的仪表板，显示随时间的趋势（而不仅仅是当前快照），并突出显示PR周期时间等先导指标以及MTTR等滞后指标",
      "与团队进行月度指标回顾，解读趋势，为下一季度设定改进目标，并将指标改进与工程投资联系起来"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "Deployment Freq",
      "Lead Time",
      "Change Failure Rate",
      "MTTR",
      "Satisfaction"
    ],
    "viz_labels_zh": [
      "部署频率",
      "交付周期",
      "变更失败率",
      "恢复时间",
      "满意度"
    ],
    "related": [
      "blameless-postmortems"
    ],
    "tags": [
      "dora",
      "space-framework",
      "metrics",
      "developer-experience",
      "engineering-effectiveness"
    ],
    "origin_author": "DORA research (Nicole Forsgren et al., 2018); SPACE framework (Forsgren et al., 2021, ACM Queue)",
    "origin_source": "Forsgren, N., Humble, J. & Kim, G. (2018). \"Accelerate\". IT Revolution Press; Forsgren, N. et al. (2021). \"The SPACE of Developer Productivity\". ACM Queue.",
    "origin_source_zh": "Forsgren, N., Humble, J. & Kim, G.（2018）「加速」，IT Revolution Press；Forsgren, N.等（2021）「开发者生产力的SPACE」，ACM Queue",
    "complexity": "intermediate",
    "when_to_use": [
      "When engineering leadership needs objective data to justify investments in developer tooling, platform engineering, or technical debt reduction",
      "When the organization is scaling rapidly and delivery performance needs to be tracked across multiple teams without relying on anecdotal evidence",
      "When there is disagreement between engineering and product/business about whether engineering capacity is being used effectively",
      "When attrition surveys cite developer experience as a concern and leadership wants to track whether investments in DX are yielding measurable improvement"
    ],
    "when_to_use_zh": [
      "当工程领导层需要客观数据来证明对开发者工具、平台工程或技术债务减少的投资合理时",
      "当组织快速扩展，需要在不依赖轶事证据的情况下跨多个团队跟踪交付性能时",
      "当工程团队与产品/业务之间对工程容量是否被有效使用存在分歧时",
      "当流失率调查将开发者体验列为关注点，领导层希望跟踪DX投资是否产生可衡量的改进时"
    ],
    "core_concepts": [
      "DORA Metrics: Four evidence-backed metrics from the DevOps Research and Assessment program — Deployment Frequency, Lead Time for Changes, Change Failure Rate, and Time to Restore Service — that predict organizational performance",
      "SPACE Framework: Five dimensions of developer productivity (Satisfaction and wellbeing, Performance, Activity, Communication and collaboration, Efficiency and flow) that together provide a holistic view beyond throughput metrics",
      "Elite Performers Benchmark: DORA research defines four performance bands; Elite teams deploy multiple times per day with <1h lead time, <15% change failure rate, and <1h MTTR — a target for continuous improvement",
      "Leading vs Lagging Indicators: PR cycle time and build duration are leading indicators that predict future DORA metrics; MTTR and change failure rate are lagging indicators of actual production outcomes",
      "Goodhart's Law Risk: When a measure becomes a target, it ceases to be a good measure — engineering metrics must be used for learning and improvement, not as performance evaluation for individual engineers"
    ],
    "core_concepts_zh": [
      "DORA指标：来自DevOps研究与评估项目的四个有证据支持的指标——部署频率、变更前置时间、变更失败率和服务恢复时间——预测组织绩效",
      "SPACE框架：开发者生产力的五个维度（满意度和幸福感、性能、活动、沟通与协作、效率和流状态），共同提供超越吞吐量指标的整体视图",
      "精英表现者基准：DORA研究定义了四个绩效档次；精英团队每天部署多次，前置时间<1小时，变更失败率<15%，MTTR<1小时——这是持续改进的目标",
      "先导指标与滞后指标：PR周期时间和构建时长是预测未来DORA指标的先导指标；MTTR和变更失败率是实际生产结果的滞后指标",
      "古德哈特定律风险：当一个指标成为目标时，它就不再是好的指标——工程指标必须用于学习和改进，而不是作为对个别工程师的绩效评估"
    ],
    "timeline": [
      [
        "2014",
        "DORA (DevOps Research and Assessment) founded by Nicole Forsgren, Jez Humble, and Gene Kim to study software delivery performance"
      ],
      [
        "2018",
        "Accelerate book publishes DORA research findings, establishing the four key metrics as industry standard for software delivery performance"
      ],
      [
        "2019",
        "Google acquires DORA; the annual State of DevOps Report becomes the definitive benchmark for engineering performance across industries"
      ],
      [
        "2021",
        "SPACE framework published in ACM Queue, extending productivity measurement beyond pipeline metrics to include developer satisfaction and flow"
      ],
      [
        "2023",
        "McKinsey Developer Productivity report triggers industry debate; DORA and SPACE researchers respond, reinforcing multi-dimensional measurement over simplistic metrics"
      ]
    ],
    "timeline_zh": [
      [
        "2014",
        "DORA（DevOps研究与评估）由Nicole Forsgren、Jez Humble和Gene Kim创立，研究软件交付性能"
      ],
      [
        "2018",
        "「加速」一书发布DORA研究成果，将四个关键指标确立为软件交付性能的行业标准"
      ],
      [
        "2019",
        "Google收购DORA；年度DevOps状态报告成为跨行业工程性能的权威基准"
      ],
      [
        "2021",
        "SPACE框架发表在ACM Queue，将生产力测量从流水线指标扩展到包括开发者满意度和流状态"
      ],
      [
        "2023",
        "麦肯锡开发者生产力报告引发行业辩论；DORA和SPACE研究人员回应，强调多维度测量而非简单化指标"
      ]
    ],
    "dos": [
      "Do measure DORA metrics at the team level, not the individual level, because they are systems-level indicators of team and organizational performance",
      "Do combine quantitative pipeline metrics with qualitative developer surveys because throughput metrics without satisfaction data miss half the picture",
      "Do share dashboards transparently with teams rather than only with management because teams improve faster when they can see their own metrics",
      "Do use metrics to identify systemic investment opportunities (slow CI, flaky tests, manual deployments) rather than to evaluate individual engineers"
    ],
    "dos_zh": [
      "在团队层面而非个人层面测量DORA指标，因为它们是团队和组织绩效的系统级指标",
      "将定量流水线指标与定性开发者调查结合起来，因为没有满意度数据的吞吐量指标只看到了一半情况",
      "向团队透明地共享仪表板，而不仅仅与管理层共享，因为当团队能看到自己的指标时，他们改进更快",
      "使用指标来识别系统性投资机会（缓慢的CI、不稳定的测试、手动部署），而不是评估个别工程师"
    ],
    "donts": [
      "Don't use DORA metrics as a performance management tool for individual engineers because it drives gaming behavior and destroys the trust needed for honest reporting",
      "Don't track activity metrics (commits per day, PRs merged) as proxies for productivity because they incentivize quantity over quality and discourage large, impactful changes",
      "Don't compare metrics across teams without controlling for domain complexity because a team maintaining a legacy payment system will have different metrics than a team building a new microservice",
      "Don't treat metrics as a substitute for direct conversation with engineers about what is slowing them down"
    ],
    "donts_zh": [
      "不要将DORA指标用作对个别工程师的绩效管理工具，因为这会驱动博弈行为并破坏诚实报告所需的信任",
      "不要将活动指标（每天提交次数、合并的PR）作为生产力的代理，因为它们激励数量而非质量，并阻碍大型有影响力的变更",
      "不要在不控制领域复杂度的情况下跨团队比较指标，因为维护遗留支付系统的团队与构建新微服务的团队会有不同的指标",
      "不要把指标当作与工程师直接对话的替代品——仍要直接询问他们是什么在拖慢工作"
    ],
    "case_study_company": "Puppet / DORA State of DevOps",
    "case_study": "The DORA State of DevOps research program, spanning 2014-2023 with over 33,000 survey respondents across industries, validated that elite software delivery performance (high deployment frequency, low change failure rate, fast MTTR) is statistically correlated with better organizational outcomes — higher profitability, market share growth, and employee satisfaction. Companies that adopted DORA metrics and tracked them as team health indicators, rather than management KPIs, consistently outperformed their peers. The research also found that psychological safety — the ability for engineers to report failures without fear — was the strongest predictor of both high DORA performance and innovation outcomes.",
    "case_study_zh": "DORA DevOps状态研究项目跨越2014-2023年，有超过33000名跨行业调查受访者，验证了精英软件交付性能（高部署频率、低变更失败率、快速MTTR）与更好的组织结果统计相关——更高的盈利能力、市场份额增长和员工满意度。将DORA指标作为团队健康指标而非管理KPI进行跟踪的公司，始终优于同行。研究还发现，心理安全——工程师在没有恐惧的情况下报告失败的能力——是高DORA绩效和创新结果的最强预测因子。",
    "when_not_to_use": [
      "Early-stage startups where the engineering team is fewer than 5 people and the overhead of measurement exceeds its benefit",
      "Research and innovation teams where the goal is exploration rather than delivery throughput, and DORA metrics would incentivize the wrong behaviors",
      "Organizations where metrics data is consistently used punitively, because introducing metrics in a low-trust environment will cause gaming rather than improvement",
      "Short-duration project teams (under 6 months) where there is insufficient time to establish baselines and observe trend improvements"
    ],
    "when_not_to_use_zh": [
      "早期创业公司，工程团队少于5人，测量开销超过其收益",
      "研究和创新团队，目标是探索而非交付吞吐量，DORA指标会激励错误的行为",
      "指标数据持续被用于惩罚性目的的组织，因为在低信任环境中引入指标会导致博弈而非改进",
      "持续时间短的项目团队（少于6个月），没有足够时间建立基准和观察趋势改进"
    ],
    "adopters": [
      "Google",
      "Microsoft",
      "Capital One",
      "Nordstrom",
      "ING Bank"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "reliability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Forsgren, N., Humble, J. & Kim, G. (2018). \"Accelerate: The Science of Lean Software and DevOps\". IT Revolution Press.",
    "secondary_sources": [
      "Forsgren, N. et al. (2021). \"The SPACE of Developer Productivity\". ACM Queue, Vol. 19.",
      "DORA (2023). \"State of DevOps Report 2023\". dora.dev.",
      "Greening, D. & Keane, S. (2021). \"Engineering Effectiveness at Shopify\". shopify.engineering."
    ],
    "typed_relations": [
      {
        "slug": "blameless-postmortems",
        "type": "complement"
      }
    ]
  },
  {
    "id": 219,
    "name": "Technical Writing as a Practice",
    "name_zh": "技术写作实践",
    "slug": "technical-writing-as-a-practice",
    "category": "team",
    "desc": "Documentation standards, style guides, and docs-as-code workflows for engineering organizations",
    "desc_zh": "工程组织的文档标准、风格指南和文档即代码工作流",
    "steps": [
      "Adopt a documentation taxonomy (the Divio/Diataxis framework: tutorials, how-to guides, explanations, and reference) and map existing documentation to the four quadrants to identify gaps",
      "Establish a docs-as-code workflow: store documentation in Git alongside code, use Markdown or AsciiDoc, review documentation in PRs, and publish via static site generators (Docusaurus, MkDocs, Sphinx)",
      "Create a style guide (or adopt an existing one: Google Developer Documentation Style Guide, Microsoft Writing Style Guide) and enforce it with automated prose linters (Vale, textlint)",
      "Define documentation ownership: each service or API has a designated documentation owner responsible for keeping it current; documentation updates are part of the team's definition of done",
      "Measure documentation quality quarterly through developer surveys ('Was the documentation sufficient to complete your task?') and track documentation coverage against the API/service inventory"
    ],
    "steps_zh": [
      "采用文档分类法（Divio/Diataxis框架：教程、操作指南、解释和参考），将现有文档映射到四个象限以识别差距",
      "建立文档即代码工作流：将文档与代码一起存储在Git中，使用Markdown或AsciiDoc，在PR中审查文档，并通过静态站点生成器（Docusaurus、MkDocs、Sphinx）发布",
      "创建风格指南（或采用现有的：Google开发者文档风格指南、Microsoft写作风格指南），并使用自动化文本风格检查工具（prose linter，如Vale、textlint）强制执行",
      "定义文档所有权：每个服务或API有指定的文档所有者负责保持其最新；文档更新被纳入团队的完成定义（Definition of Done）",
      "通过开发者调查每季度衡量文档质量（「文档是否足以完成您的任务？」），并跟踪API/服务清单中的文档覆盖率"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Tutorials",
      "How-To Guides",
      "Explanations",
      "Reference",
      "Quality Metrics"
    ],
    "viz_labels_zh": [
      "教程",
      "操作指南",
      "解释说明",
      "参考文档",
      "质量度量"
    ],
    "related": [
      "platform-engineering",
      "rfc-process"
    ],
    "tags": [
      "documentation",
      "docs-as-code",
      "divio",
      "style-guide",
      "technical-writing"
    ],
    "origin_author": "Divio documentation system (Daniele Procida, 2017); docs-as-code popularized by Anne Gentle (2015)",
    "origin_source": "Procida, D. (2017). \"The Documentation System\". divio.com; Gentle, A. (2015). \"Docs Like Code\". lulu.com; Diataxis framework (diataxis.fr).",
    "origin_source_zh": "Procida, D.（2017）「文档系统」，divio.com；Gentle, A.（2015）「像代码一样的文档」；Diataxis框架（diataxis.fr）",
    "complexity": "beginner",
    "when_to_use": [
      "When engineering teams spend significant time answering the same questions repeatedly because documentation is absent, outdated, or hard to find",
      "When onboarding new engineers takes weeks longer than expected because institutional knowledge lives in people's heads rather than written documentation",
      "When the API or platform is becoming a product used by external developers who need documentation as good as the code itself",
      "When the team has grown past 20 engineers and informal knowledge transfer through conversation no longer scales"
    ],
    "when_to_use_zh": [
      "当工程团队花费大量时间重复回答相同问题，因为文档缺失、过时或难以找到时",
      "当新工程师入职花费比预期长数周时间，因为机构知识存在于人们的脑海中而非书面文档时",
      "当API或平台正在成为外部开发者使用的产品，需要与代码本身一样好的文档时",
      "当团队超过20名工程师，通过对话进行的非正式知识传递不再具有可扩展性时"
    ],
    "core_concepts": [
      "Diataxis Framework: Four documentation types serving different user needs — Tutorials (learning-oriented, taking the user through a complete example), How-To Guides (task-oriented, solving a specific problem), Explanations (understanding-oriented, discussing concepts), and Reference (information-oriented, describing the system accurately)",
      "Docs as Code: Treating documentation with the same rigor as code — stored in Git, reviewed in PRs, tested for accuracy, versioned alongside releases, and built/deployed through CI/CD pipelines",
      "Style Guide: A shared standard for voice, tone, formatting, terminology, and code sample conventions that makes documentation feel consistent regardless of who wrote it",
      "Documentation Debt: The accumulated deficit of missing, outdated, or inaccurate documentation that creates a hidden tax on every engineer who uses the underdocumented system",
      "The Documentation Triangle: For every release, three artifacts must be updated — the code, the tests, and the documentation — treating all three as first-class deliverables"
    ],
    "core_concepts_zh": [
      "Diataxis框架：满足不同用户需求的四种文档类型——教程（面向学习，带领用户完成完整示例）、操作指南（面向任务，解决特定问题）、解释（面向理解，讨论概念）和参考（面向信息，准确描述系统）",
      "文档即代码：以与代码相同的严格程度对待文档——存储在Git中、在PR中审查、测试准确性、随发布版本化，并通过CI/CD流水线构建/部署",
      "风格指南：关于语气、格式、术语和代码示例约定的共享标准，使文档无论谁写都感觉一致",
      "文档债务：缺失、过时或不准确文档的累积赤字，对使用文档不足系统的每个工程师造成隐形税",
      "文档三角：对于每个发布，必须更新三个产物——代码、测试和文档——将三者都视为一级可交付成果"
    ],
    "timeline": [
      [
        "2013",
        "Write the Docs community founded, bringing together technical writers and engineers to develop documentation best practices"
      ],
      [
        "2015",
        "Anne Gentle publishes 'Docs Like Code', popularizing the docs-as-code workflow using Git, Markdown, and static site generators"
      ],
      [
        "2017",
        "Divio publishes 'The Documentation System' (later Diataxis), providing a widely-adopted taxonomy of documentation types"
      ],
      [
        "2019",
        "Google Developer Documentation Style Guide released publicly, becoming a reference standard for technical writing in engineering organizations"
      ],
      [
        "2023",
        "AI-assisted documentation tools (GitHub Copilot for docs, Mintlify Writer) automate docstring generation and keep reference documentation current"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "Write the Docs社区成立，汇集技术写作者和工程师开发文档最佳实践"
      ],
      [
        "2015",
        "Anne Gentle发布「像代码一样的文档」，推广使用Git、Markdown和静态站点生成器的文档即代码工作流"
      ],
      [
        "2017",
        "Divio发布「文档系统」（后来的Diataxis），提供被广泛采用的文档类型分类法"
      ],
      [
        "2019",
        "Google开发者文档风格指南公开发布，成为工程组织技术写作的参考标准"
      ],
      [
        "2023",
        "AI辅助文档工具（GitHub Copilot for docs、Mintlify Writer）自动化文档字符串生成并保持参考文档最新"
      ]
    ],
    "dos": [
      "Do use the Diataxis framework to separate tutorials from reference documentation because mixing them produces documentation that serves neither learning nor lookup well",
      "Do enforce documentation updates in the PR review checklist so that documentation debt doesn't accumulate silently with every feature release",
      "Do run automated link-checking and prose linting in CI so that broken links and style inconsistencies are caught at merge time rather than discovered by frustrated readers",
      "Do invest in documentation search because even the best-written documentation is useless if engineers cannot find it within 30 seconds"
    ],
    "dos_zh": [
      "使用Diataxis框架将教程与参考文档分离，因为混合它们产生的文档既不适合学习也不适合查找",
      "在PR审查清单中强制执行文档更新，使文档债务不随每个功能发布静默积累",
      "在CI中运行自动化链接检查和文本风格检查（prose linting），使断链和风格不一致在合并时就被发现，而不是由沮丧的读者发现",
      "投资文档搜索，因为即使是写得最好的文档，如果工程师不能在30秒内找到它，也是没用的"
    ],
    "donts": [
      "Don't assign documentation as a post-release afterthought because it will never be prioritized over the next feature and the documentation debt will compound",
      "Don't use documentation as a substitute for good API design because if the documentation for an API is long and complex, the API itself is probably too complex",
      "Don't let documentation ownership become 'everyone's problem' because diffuse ownership is the primary cause of stale, inaccurate documentation",
      "Don't optimize documentation for the writer's convenience (dumping raw notes, linking to internal Slack threads, assuming tribal knowledge) — optimize it for a reader encountering the topic for the first time"
    ],
    "donts_zh": [
      "不要将文档视为发布后的事后想法，因为它永远不会比下一个功能优先，文档债务会复利增长",
      "不要使用文档作为良好API设计的替代品，因为如果API的文档又长又复杂，API本身可能过于复杂",
      "不要让文档所有权成为「所有人的问题」，因为分散的所有权是文档过时、不准确的主要原因",
      "不要为写作者的便利优化文档（倾倒原始笔记、链接到内部Slack线程、假设部落知识）——为首次遇到该主题的读者优化"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe has consistently ranked at the top of developer experience benchmarks, with its documentation cited as a primary driver. Stripe's documentation approach embodies every docs-as-code principle: all documentation is stored in a monorepo alongside the API code, reviewed in the same PR workflow as code changes, and built through an automated publishing pipeline. Each API endpoint has dedicated tutorial content, code samples in 8 languages generated from a single source, and a reference section. Stripe employs over 20 technical writers who collaborate directly with engineers rather than receiving finished content to document. Internal research showed that new developers who could complete a working integration in under 30 minutes (possible because of documentation quality) had significantly higher activation rates.",
    "case_study_zh": "Stripe在开发者体验基准中始终排名靠前，其文档被列为主要驱动因素。Stripe的文档方法体现了所有文档即代码原则：所有文档与API代码一起存储在单体仓库中，在与代码变更相同的PR工作流中审查，并通过自动化发布流水线构建。每个API端点都有专门的教程内容、从单一来源生成的8种语言代码示例和参考部分。Stripe雇用超过20名技术写作者，他们直接与工程师协作，而不是接收已完成的内容来记录。内部研究表明，能够在30分钟内完成工作集成（由于文档质量而成为可能）的新开发者具有显著更高的激活率。",
    "when_not_to_use": [
      "Internal proof-of-concept projects with a lifespan under 3 months where the documentation investment will not be recouped before the project is discarded",
      "Highly experimental research systems where the design changes faster than documentation can be written and maintained",
      "Proprietary code that will never be shared beyond the immediate team, where informal knowledge transfer through pair programming and code reviews is sufficient",
      "Automated code where inline comments and self-documenting variable names serve as documentation, and separate prose documentation would be redundant"
    ],
    "when_not_to_use_zh": [
      "寿命不足3个月的内部概念验证项目，文档投资在项目被丢弃之前无法得到回报",
      "高度实验性研究系统，设计变化速度比文档可以编写和维护的速度更快",
      "永远不会在直接团队之外共享的专有代码，通过结对编程和代码审查进行的非正式知识传递已经足够",
      "自动化代码，其中内联注释和自文档变量名充当文档，单独的散文文档将是多余的"
    ],
    "adopters": [
      "Stripe",
      "Twilio",
      "Cloudflare",
      "HashiCorp",
      "Netlify"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Procida, D. (2017). \"The Documentation System\". divio.com / diataxis.fr.",
    "secondary_sources": [
      "Gentle, A. (2015). \"Docs Like Code\". lulu.com.",
      "Google (2019). \"Google Developer Documentation Style Guide\". developers.google.com/style.",
      "Write the Docs community (2023). \"Documentation Best Practices\". writethedocs.org."
    ],
    "typed_relations": [
      {
        "slug": "platform-engineering",
        "type": "complement"
      },
      {
        "slug": "rfc-process",
        "type": "complement"
      }
    ]
  },
  {
    "id": 280,
    "name": "Engineering Effectiveness",
    "name_zh": "工程效能",
    "slug": "engineering-effectiveness",
    "category": "team",
    "desc": "Measuring and improving developer productivity through evidence-based metrics, tooling investment, and systemic friction removal",
    "desc_zh": "通过基于证据的指标、工具投资和系统性摩擦消除来衡量并提升开发者生产力",
    "steps": [
      "Baseline current state by collecting DORA metrics (deployment frequency, lead time, change failure rate, MTTR) alongside developer satisfaction surveys to understand both system throughput and human experience",
      "Identify top friction sources through structured developer interviews and workflow analysis — common culprits are slow CI pipelines, manual toil, unclear ownership, and environment setup delays",
      "Define a prioritized investment roadmap that targets the highest-leverage friction points: fast CI (<10 min), self-service environments, automated onboarding, and reliable test infrastructure",
      "Establish an Engineering Effectiveness function or team (1-2 engineers per 100 developers) responsible for toolchain health, internal developer tooling, and periodic effectiveness surveys",
      "Review metrics quarterly, share dashboards transparently with teams, and treat improvements as team-owned goals rather than top-down mandates"
    ],
    "steps_zh": [
      "通过收集 DORA 指标（部署频率、前置时间、变更失败率、MTTR）以及开发者满意度调查建立基准，了解系统吞吐量和人员体验",
      "通过结构化开发者访谈和工作流分析识别主要摩擦来源——常见问题包括缓慢的 CI 流水线、手动重复劳动、职责不清和环境搭建延迟",
      "制定优先级投资路线图，针对最具杠杆效应的摩擦点：快速 CI（<10 分钟）、自助服务环境、自动化入职和可靠的测试基础设施",
      "建立工程效能职能或团队（每 100 名开发者配置 1-2 名工程师），负责工具链健康、内部开发者工具和定期效能调查",
      "每季度审查指标，向团队透明共享仪表板，将改进视为团队自主目标而非自上而下的指令"
    ],
    "ai_relevant": true,
    "viz_type": "radar",
    "viz_labels": [
      "DORA Baseline",
      "Friction Sources",
      "Investment Roadmap",
      "Effectiveness Team",
      "Quarterly Review"
    ],
    "viz_labels_zh": [
      "DORA基线",
      "摩擦点",
      "投资路线图",
      "效能团队",
      "季度评审"
    ],
    "related": [
      "engineering-metrics-dashboard",
      "developer-experience-framework",
      "platform-engineering",
      "blameless-postmortems"
    ],
    "tags": [
      "developer-productivity",
      "dora-metrics",
      "space-framework",
      "developer-experience",
      "measurement"
    ],
    "origin_author": "Nicole Forsgren",
    "origin_source": "Forsgren, N., Humble, J. & Kim, G. (2018). \"Accelerate\". IT Revolution Press; Forsgren, N. et al. (2021). \"The SPACE of Developer Productivity\". ACM Queue, Vol. 19.",
    "origin_source_zh": "Forsgren, N., Humble, J. & Kim, G.（2018）「加速」，IT Revolution Press；Forsgren, N. 等（2021）「开发者生产力的 SPACE 框架」，ACM Queue，第 19 卷",
    "complexity": "intermediate",
    "when_to_use": [
      "When engineering leadership needs objective evidence to justify tooling or platform investments beyond anecdotal developer complaints",
      "When the organization is scaling past 50 engineers and delivery velocity is perceived to be slowing despite headcount growth",
      "When developer attrition surveys cite frustrating tooling or slow feedback loops as a reason for leaving",
      "When there is tension between product and engineering over whether engineering capacity is being used effectively"
    ],
    "when_to_use_zh": [
      "当工程领导层需要客观证据来证明工具或平台投资合理，而不只是依赖开发者的感性抱怨时",
      "当组织规模超过 50 名工程师，尽管人员增加但交付速度被认为在下降时",
      "当开发者离职调查将令人沮丧的工具或缓慢的反馈循环列为离职原因时",
      "当产品与工程之间对于工程容量是否被有效使用存在张力时"
    ],
    "core_concepts": [
      "DORA Metrics: Four evidence-backed delivery metrics — Deployment Frequency, Lead Time for Changes, Change Failure Rate, and Mean Time to Restore — that predict both team performance and organizational outcomes",
      "SPACE Framework: A multidimensional productivity model covering Satisfaction & wellbeing, Performance, Activity, Communication & collaboration, and Efficiency & flow — preventing over-reliance on any single throughput metric",
      "Developer Cognitive Load: The mental overhead imposed by tooling complexity, unclear processes, and context switching; reducing it is the primary lever for improving individual effectiveness",
      "Toil Elimination: Systematically identifying and automating repetitive, manual, undifferentiated work (build babysitting, manual deployments, ticket triaging) to return engineering time to impactful work",
      "Effectiveness Function: A dedicated internal team or role that owns developer toolchain health, measures friction, and ships productivity improvements — treating developer experience as a first-class product"
    ],
    "core_concepts_zh": [
      "DORA 指标：四个有证据支持的交付指标——部署频率、变更前置时间、变更失败率和平均恢复时间——可预测团队绩效和组织结果",
      "SPACE 框架：多维生产力模型，涵盖满意度与幸福感、绩效、活动、沟通与协作、效率与心流——防止过度依赖单一吞吐量指标",
      "开发者认知负荷：工具复杂度、流程不清晰和上下文切换带来的心理开销；降低认知负荷是提升个人效能的主要杠杆",
      "消除重复劳动：系统性识别并自动化重复的、手动的、无差异化的工作（构建监控、手动部署、工单分类），将工程时间还给有影响力的工作",
      "效能职能：专门的内部团队或角色，负责开发者工具链健康、摩擦度量和效能改进交付，将开发者体验视为一等产品"
    ],
    "timeline": [
      [
        "2014",
        "DORA (DevOps Research and Assessment) founded by Nicole Forsgren, Jez Humble, and Gene Kim — the first large-scale empirical study linking software delivery practices to organizational performance"
      ],
      [
        "2018",
        "Accelerate published, establishing the four DORA metrics as the canonical measure of software delivery performance and sparking the engineering effectiveness movement"
      ],
      [
        "2021",
        "SPACE framework published in ACM Queue, extending productivity measurement to include developer wellbeing and flow alongside pipeline metrics"
      ],
      [
        "2023",
        "McKinsey Developer Productivity report controversy prompts broad industry response; leading engineering orgs reaffirm multi-dimensional, team-level measurement over individual activity counting"
      ]
    ],
    "timeline_zh": [
      [
        "2014",
        "DORA 由 Nicole Forsgren、Jez Humble 和 Gene Kim 创立——首次大规模实证研究将软件交付实践与组织绩效相关联"
      ],
      [
        "2018",
        "「加速」出版，将四个 DORA 指标确立为软件交付性能的权威衡量标准，引发工程效能运动"
      ],
      [
        "2021",
        "SPACE 框架发表于 ACM Queue，将生产力测量扩展到包括开发者幸福感和心流，而不仅限于流水线指标"
      ],
      [
        "2023",
        "麦肯锡开发者生产力报告引发争议，促使业界广泛回应；领先工程组织重申多维度、团队层级的测量，而非个人活动计数"
      ]
    ],
    "dos": [
      "Do measure at the team level rather than the individual level — engineering effectiveness metrics are systems indicators, not performance scorecards for individuals",
      "Do combine quantitative pipeline metrics with qualitative developer surveys because neither alone gives the full picture of what is slowing teams down",
      "Do share effectiveness dashboards directly with engineering teams so they can self-diagnose and own improvements rather than waiting for leadership to act",
      "Do invest in the highest-leverage friction points first — fixing slow CI and unreliable tests typically yields the largest productivity gains per dollar invested"
    ],
    "dos_zh": [
      "在团队层面而非个人层面进行测量——工程效能指标是系统性指标，而非针对个人的绩效评分卡",
      "将定量流水线指标与定性开发者调查相结合，因为两者单独都无法全面呈现团队减速的原因",
      "直接向工程团队共享效能仪表板，使其能够自我诊断并主导改进，而不是等待领导层行动",
      "优先投资最具杠杆效应的摩擦点——解决缓慢的 CI 和不可靠的测试，通常能以单位投入换来最大的生产力收益"
    ],
    "donts": [
      "Don't use DORA metrics or any effectiveness metrics as individual performance evaluation — it drives gaming, destroys psychological safety, and corrupts the measurement",
      "Don't track activity metrics (commits, PRs, lines of code) as proxies for productivity — they incentivize busywork and penalize high-impact, high-complexity changes",
      "Don't compare effectiveness metrics across teams without accounting for domain complexity, team tenure, and system maturity",
      "Don't launch an engineering effectiveness program without leadership commitment to act on the findings — measuring without improving erodes trust"
    ],
    "donts_zh": [
      "不要将 DORA 指标或任何效能指标用于个人绩效评估——这会导致博弈行为、破坏心理安全感并污染测量本身",
      "不要将活动指标（提交次数、PR 数量、代码行数）作为生产力的代理——它们激励无效忙碌，并惩罚高影响力、高复杂度的变更",
      "不要在不考虑领域复杂度、团队年限和系统成熟度的情况下跨团队比较效能指标",
      "不要在没有领导层承诺根据发现采取行动的情况下启动工程效能项目——测量而不改进会侵蚀信任"
    ],
    "case_study_company": "Shopify",
    "case_study": "Shopify's engineering effectiveness team, described in public engineering blog posts from 2020-2022, applied the SPACE framework to diagnose that their primary productivity bottleneck was not headcount or tooling complexity but CI pipeline reliability and duration. After investing in hermetic build environments and parallelized test execution, their median CI time dropped from 22 minutes to under 8 minutes. Developer satisfaction scores (measured quarterly) improved by 18 percentage points. The team tracked Deployment Frequency as a leading indicator and observed it increase from 40 to over 200 deploys per day across the organization within 18 months of the initiative — a direct outcome of reduced cycle time and increased developer confidence in fast feedback.",
    "case_study_zh": "Shopify 的工程效能团队在 2020-2022 年的公开工程博客中描述，他们运用 SPACE 框架诊断出主要生产力瓶颈不是人员不足或工具复杂度，而是 CI 流水线的可靠性和时长。在对密封构建环境和并行化测试执行进行投资后，中位 CI 时间从 22 分钟降至 8 分钟以内。季度测量的开发者满意度评分提升了 18 个百分点。团队将部署频率作为先导指标进行跟踪，在该计划启动后 18 个月内，组织每日部署次数从 40 次增加到 200 次以上——这是缩短周期时间和增强开发者对快速反馈信心的直接结果。",
    "when_not_to_use": [
      "Teams fewer than 10 engineers where the overhead of formal measurement programs exceeds the benefit and direct conversation is sufficient",
      "Short-lived project teams (under 3 months) where there is insufficient baseline data to trend meaningful improvements",
      "Organizations where metrics data is routinely used punitively — introducing measurement in low-trust environments causes gaming rather than improvement",
      "Research or innovation teams where exploration velocity is the goal and pipeline throughput metrics are irrelevant"
    ],
    "when_not_to_use_zh": [
      "少于 10 名工程师的团队，正式测量项目的开销超过其收益，直接沟通已足够",
      "短期项目团队（少于 3 个月），缺乏足够基准数据来追踪有意义的改进趋势",
      "指标数据惯常被用于惩罚性目的的组织——在低信任环境中引入测量会导致博弈而非改进",
      "以探索速度为目标的研究或创新团队，流水线吞吐量指标在此无关紧要"
    ],
    "adopters": [
      "Shopify",
      "Google",
      "LinkedIn",
      "Stripe",
      "Spotify"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "usability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Forsgren, N., Humble, J. & Kim, G. (2018). \"Accelerate: The Science of Lean Software and DevOps\". IT Revolution Press.",
    "secondary_sources": [
      "Forsgren, N. et al. (2021). \"The SPACE of Developer Productivity\". ACM Queue, Vol. 19.",
      "DORA (2023). \"State of DevOps Report 2023\". dora.dev.",
      "Greening, D. & Keane, S. (2021). \"Engineering Effectiveness at Shopify\". shopify.engineering."
    ],
    "typed_relations": [
      {
        "slug": "engineering-metrics-dashboard",
        "type": "complement"
      },
      {
        "slug": "developer-experience-framework",
        "type": "complement"
      },
      {
        "slug": "platform-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 281,
    "name": "Developer Onboarding Framework",
    "name_zh": "开发者入职框架",
    "slug": "developer-onboarding-framework",
    "category": "team",
    "desc": "Structured ramp-up program for new engineers combining role clarity, progressive autonomy, and embedded social integration",
    "desc_zh": "为新工程师设计的结构化成长计划，融合角色清晰度、渐进式自主权和融入团队社交",
    "steps": [
      "Create a written onboarding document (30/60/90-day plan) per role before the engineer joins, specifying expected milestones, assigned buddy, first task backlog, and system access checklist — so Day 1 is productive rather than administrative",
      "Assign a dedicated onboarding buddy (a peer, not the manager) who is available for questions without judgment for the first 30 days and whose own velocity is formally protected during this period",
      "Structure the first week as guided exploration: new hire ships a small, real change to production (a bug fix, a doc update, a minor feature) within 5 working days to build system confidence and establish commit access",
      "Introduce the codebase through context maps — architecture diagrams, ADRs, team-authored runbooks — before asking the engineer to read raw code, so they understand intent before implementation",
      "Run retrospective check-ins at 30, 60, and 90 days to identify gaps in tooling, documentation, or social integration, and feed findings back into the onboarding document for the next hire"
    ],
    "steps_zh": [
      "在工程师入职前，为每个角色创建书面入职文档（30/60/90 天计划），明确预期里程碑、指派的入职伙伴、首批任务清单和系统访问清单——确保第一天就能高效工作而非埋头行政事务",
      "指定专属入职伙伴（同级同事而非经理），在前 30 天随时可无顾虑地提问，且其自身的工作进度在此期间受到正式保护",
      "将第一周构建为引导式探索：新员工在入职 5 个工作日内将一个小型真实变更发布到生产环境（修复 bug、更新文档、小功能），以建立系统信心并完成提交权限配置",
      "通过上下文地图引导熟悉代码库——架构图、ADR、团队编写的操作手册——而非直接阅读原始代码，使其在理解意图后再看实现",
      "在 30、60 和 90 天进行回顾性检查，识别工具、文档或社交融入方面的差距，并将发现反馈回入职文档，供下一位新员工使用"
    ],
    "ai_relevant": false,
    "viz_type": "timeline",
    "viz_labels": [
      "30/60/90 Plan",
      "Onboarding Buddy",
      "First Commit",
      "Context Maps",
      "Check-in Retro"
    ],
    "viz_labels_zh": [
      "入职计划",
      "入职伙伴",
      "首次提交",
      "架构地图",
      "定期回顾"
    ],
    "related": [
      "engineering-ladder",
      "technical-mentorship-program",
      "developer-experience-framework",
      "rfc-process"
    ],
    "tags": [
      "onboarding",
      "new-hire",
      "ramp-up",
      "buddy-program",
      "developer-experience"
    ],
    "origin_author": "Kate Heddleston",
    "origin_source": "Heddleston, K. (2016). \"How to Onboard Software Engineers\". kateheddleston.com; Heddleston, K. (2018). \"Criticism and Ineffective Feedback\". kateheddleston.com.",
    "origin_source_zh": "Heddleston, K.（2016）「如何入职软件工程师」，kateheddleston.com；Heddleston, K.（2018）「批评与低效反馈」，kateheddleston.com",
    "complexity": "intermediate",
    "when_to_use": [
      "When the team is hiring more than 2-3 engineers per year and onboarding is informal, inconsistent, and dependent on whoever has bandwidth to help",
      "When new engineers consistently take 3+ months to make independent contributions, suggesting the path to productivity is unclear or blocked",
      "When attrition in the first 6 months is higher than expected and exit interviews reveal onboarding confusion or social isolation as contributing factors",
      "When the team is distributed across timezones and new hires cannot rely on informal corridor conversations to fill knowledge gaps"
    ],
    "when_to_use_zh": [
      "当团队每年招募 2-3 名以上工程师，而入职流程是非正式、不一致且依赖于恰好有时间的人帮助时",
      "当新工程师持续需要 3 个月以上才能独立做出贡献，表明通往生产力的路径不清晰或受阻时",
      "当入职前 6 个月的离职率高于预期，且离职访谈显示入职混乱或社交孤立是原因之一时",
      "当团队跨时区分布，新员工无法依靠非正式的走廊对话来填补知识空白时"
    ],
    "core_concepts": [
      "30/60/90-Day Plan: A role-specific roadmap defining what a new engineer should learn, build, and own at each milestone — transforming onboarding from a passive process into an active progression with clear success criteria",
      "Onboarding Buddy: A dedicated peer (distinct from the manager) who provides psychological safety for 'silly questions', navigates social dynamics, and offers unfiltered context about team culture — protected time for the buddy is essential",
      "First Contribution Fast: The practice of designing onboarding so that a new hire ships a real change to production within the first week — building system confidence, establishing toolchain access, and creating a sense of belonging",
      "Context Before Code: Presenting architectural intent (ADRs, diagrams, runbooks) before raw implementation because understanding why a system is designed as it is dramatically accelerates reading and modifying code correctly",
      "Onboarding as a Product: Treating the onboarding process as an internal product with owners, feedback loops, and continuous improvement cycles — the 90-day retrospective feeds directly into the next version of the onboarding doc"
    ],
    "core_concepts_zh": [
      "30/60/90 天计划：针对特定角色的路线图，定义新工程师在每个里程碑应学习、构建和负责的内容——将入职从被动过程转变为有明确成功标准的主动进阶",
      "入职伙伴：专属同级同事（有别于经理），提供提「傻问题」的心理安全感，引导社交动态，并提供关于团队文化的未过滤背景——伙伴的保护时间至关重要",
      "快速首次贡献：通过精心设计入职流程，让新员工在入职第一周内将真实变更发布到生产环境——建立系统信心、打通工具链访问权限并产生归属感",
      "先理解背景再读代码：在阅读原始实现之前先呈现架构意图（ADR、图表、操作手册），因为理解系统如此设计的原因能显著加速正确阅读和修改代码",
      "将入职视为产品：将入职流程作为有负责人、反馈循环和持续改进周期的内部产品——90 天回顾直接反馈到下一版入职文档中"
    ],
    "timeline": [
      [
        "2009",
        "Google Project Oxygen research identifies manager behaviors that most correlate with team performance, including onboarding support — making structured ramp-up an evidence-backed practice"
      ],
      [
        "2016",
        "Kate Heddleston publishes influential essays on engineering onboarding, highlighting the 'brilliant jerk' culture problem and advocating for systematic, inclusive onboarding processes"
      ],
      [
        "2020",
        "Remote-first shift due to COVID-19 forces structured onboarding to become a written discipline as informal office-based knowledge transfer disappeared overnight"
      ],
      [
        "2023",
        "AI pair programming tools (GitHub Copilot, Cursor) begin to accelerate the 'first contribution' milestone but introduce new onboarding challenges around AI-assisted code review and quality standards"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "Google Project Oxygen 研究识别出与团队绩效最相关的管理者行为，包括入职支持——使结构化成长成为有证据支持的实践"
      ],
      [
        "2016",
        "Kate Heddleston 发表关于工程入职的影响力文章，提出「优秀混蛋」文化问题，倡导系统性、包容性的入职流程"
      ],
      [
        "2020",
        "COVID-19 导致的远程优先转变迫使结构化入职成为书面规范，因为基于办公室的非正式知识传递一夜之间消失"
      ],
      [
        "2023",
        "AI 配对编程工具（GitHub Copilot、Cursor）开始加速「首次贡献」里程碑，但在 AI 辅助代码审查和质量标准方面引入了新的入职挑战"
      ]
    ],
    "dos": [
      "Do create the 30/60/90-day plan before the engineer joins, not during their first week, because improvising onboarding structure in real time signals disorganization and reduces new hire confidence",
      "Do protect the onboarding buddy's sprint capacity — if helping new hires competes with their own delivery goals, buddy relationships become resentful rather than supportive",
      "Do ensure the new hire ships something real to production within the first week because a working deployment end-to-end is the most effective way to verify that all toolchain access and processes are correctly configured",
      "Do treat onboarding documentation as a living product — run a retrospective after every new hire's 90 days and update the document with what was missing or confusing"
    ],
    "dos_zh": [
      "在工程师入职之前而非入职第一周创建 30/60/90 天计划，因为实时即兴构建入职结构传递出无序感，降低新员工信心",
      "保护入职伙伴的迭代容量——如果帮助新员工与其自身的交付目标相竞争，伙伴关系会变得充满怨恨而非支持性",
      "确保新员工在第一周内将真实内容发布到生产环境，因为端到端的完整部署是验证所有工具链访问权限和流程正确配置的最有效方式",
      "将入职文档视为活跃产品——每位新员工 90 天后进行回顾并更新缺失或令人困惑的内容"
    ],
    "donts": [
      "Don't assign onboarding as a manager's responsibility alone — managers lack time for deep technical mentoring and new hires need a peer-level relationship without the power dynamic",
      "Don't front-load the first week with presentations, company all-hands, and HR compliance training at the expense of getting the engineer into the actual development workflow",
      "Don't skip the architectural context stage by pointing new hires directly at the codebase — reading unfamiliar code without knowing the intent leads to cargo-cult modifications and embedded misunderstandings",
      "Don't treat onboarding as complete at 30 days — most engineers need 3-6 months to reach full independent contribution velocity, and abandoning structured support too early leaves them floundering"
    ],
    "donts_zh": [
      "不要将入职仅作为经理的责任——经理没有时间进行深度技术指导，新员工需要没有权力关系的同级关系",
      "不要用演示文稿、公司全体会议和 HR 合规培训填满第一周，以牺牲让工程师进入实际开发工作流为代价",
      "不要跳过架构背景阶段，直接让新员工阅读代码库——在不了解意图的情况下阅读陌生代码会导致照猫画虎式的修改和根深蒂固的误解",
      "不要在 30 天后就认为入职完成——大多数工程师需要 3-6 个月才能达到完全独立的贡献速度，过早放弃结构化支持会让他们陷入困境"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe's engineering onboarding program, described in multiple public engineering talks (2018-2022), is regarded as one of the most effective in the industry. New engineers are given a curated 'getting started' codebase path, assigned a dedicated onboarding mentor for 60 days (separate from their team lead), and are expected to close a real customer-facing bug within their first five working days. The onboarding document is version-controlled in the same monorepo as the codebase. Internal data showed that engineers who completed the full structured onboarding program reached independent review-level contributions 40% faster than those who went through ad-hoc onboarding during a period of rapid growth. Stripe credits structured onboarding as a key factor in maintaining code quality standards as headcount scaled from 200 to 4000+ engineers.",
    "case_study_zh": "Stripe 的工程入职计划在多次公开工程演讲（2018-2022）中被描述为业内最有效的计划之一。新工程师获得精心策划的「入门」代码库路径，分配专属入职导师 60 天（独立于团队负责人），并被期望在入职后前五个工作日内解决一个真实的面向客户的 bug。入职文档与代码库一起进行版本控制，存储在同一 monorepo 中。内部数据显示，完成完整结构化入职计划的工程师，达到独立审查级别贡献的速度比经历临时入职的工程师快 40%，这一对比发生在快速增长时期。Stripe 将结构化入职列为在团队从 200 人扩展到 4000 名以上工程师时维持代码质量标准的关键因素。",
    "when_not_to_use": [
      "Micro-teams of fewer than 5 engineers where pair programming and direct collaboration serve as natural onboarding and formal programs add unnecessary overhead",
      "Short-term contractors or consultants engaged for a specific, bounded task who do not need deep system context",
      "Teams where the onboarding culture is intentionally steep as a self-selection mechanism — though this is rarely a justifiable design choice at scale",
      "Experimental projects with a lifespan under 6 months where the onboarding investment will not be recouped"
    ],
    "when_not_to_use_zh": [
      "少于 5 名工程师的微型团队，配对编程和直接协作可自然充当入职，正式项目增加了不必要的开销",
      "参与特定、有边界任务的短期承包商或顾问，不需要深度系统背景",
      "团队故意将入职流程设置得很陡峭以作为自我筛选机制的情况——尽管这在规模化时很少是合理的设计选择",
      "预期寿命不足 6 个月的实验性项目，入职投资无法得到回报"
    ],
    "adopters": [
      "Stripe",
      "Airbnb",
      "Shopify",
      "Netlify",
      "GitHub"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Heddleston, K. (2016). \"How to Onboard Software Engineers\". kateheddleston.com.",
    "secondary_sources": [
      "Klein, C. & Luciani, C. (2018). \"Onboarding at Stripe\". stripe.com/blog.",
      "Google re:Work (2019). \"Guide: Onboard New Employees Effectively\". rework.withgoogle.com.",
      "Larson, W. (2019). \"An Elegant Puzzle: Systems of Engineering Management\". Stripe Press."
    ],
    "typed_relations": [
      {
        "slug": "engineering-ladder",
        "type": "complement"
      },
      {
        "slug": "developer-experience-framework",
        "type": "complement"
      },
      {
        "slug": "guilds-communities-of-practice",
        "type": "related"
      }
    ]
  },
  {
    "id": 282,
    "name": "Technical Mentorship Program",
    "name_zh": "技术导师制计划",
    "slug": "technical-mentorship-program",
    "category": "team",
    "desc": "Structured mentoring relationships that accelerate engineering growth through deliberate skill pairing, learning goals, and accountability",
    "desc_zh": "通过刻意的技能配对、学习目标和问责机制，加速工程师成长的结构化导师关系",
    "steps": [
      "Define the mentorship program scope: distinguish between career mentorship (helping navigate organizational growth), technical mentorship (deepening domain expertise), and onboarding mentorship (ramp-up support) — each requires a different matching strategy",
      "Design the matching process: collect mentee learning goals and mentor expertise areas, then use structured matching (not volunteer-only pairing) to ensure high-leverage pairings across experience levels and technical domains",
      "Establish a lightweight operating cadence: bi-weekly 1:1s with an agenda template, a shared learning log, and a 90-day goal check-in — enough structure to prevent the relationship from drifting into casual chat, but light enough not to feel bureaucratic",
      "Provide mentors with facilitation training: teach active listening, growth-oriented feedback (specific, behavior-focused, actionable), and how to ask questions that develop the mentee's reasoning rather than delivering answers",
      "Measure program outcomes through mentor and mentee surveys at 6 months: track whether learning goals were achieved, whether the mentee feels their trajectory accelerated, and whether the mentor found the relationship rewarding — use results to refine matching and training"
    ],
    "steps_zh": [
      "定义导师计划范围：区分职业导师制（帮助导航组织成长）、技术导师制（深化领域专业知识）和入职导师制（成长支持）——每种都需要不同的配对策略",
      "设计配对流程：收集学员的学习目标和导师的专业领域，然后使用结构化配对（而非仅依靠自愿配对）确保跨经验级别和技术领域的高杠杆配对",
      "建立轻量级运营节奏：每两周一次的 1:1，配有议程模板、共享学习日志和 90 天目标检查——足够的结构防止关系演变为随意聊天，但又轻便得不会显得官僚",
      "为导师提供引导培训：教授积极倾听、以成长为导向的反馈（具体、以行为为中心、可操作），以及如何提问来培养学员的推理能力而非直接给出答案",
      "通过 6 个月时的导师和学员调查衡量项目成果：跟踪学习目标是否达成、学员是否感到自身轨迹加速，以及导师是否觉得这段关系有收获——用结果改进配对和培训"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Scope Define",
      "Matching Process",
      "Operating Cadence",
      "Mentor Training",
      "Outcome Survey"
    ],
    "viz_labels_zh": [
      "范围定义",
      "匹配机制",
      "运作节奏",
      "导师培训",
      "效果调查"
    ],
    "related": [
      "engineering-ladder",
      "developer-onboarding-framework",
      "guilds-communities-of-practice",
      "inner-source"
    ],
    "tags": [
      "mentorship",
      "career-development",
      "engineering-growth",
      "knowledge-transfer",
      "team-culture"
    ],
    "origin_author": "Camille Fournier",
    "origin_source": "Fournier, C. (2017). \"The Manager's Path\". O'Reilly Media; Heddleston, K. (2016). \"Onboarding and the Cost of Team Debt\". kateheddleston.com.",
    "origin_source_zh": "Fournier, C.（2017）「管理者之路」，O'Reilly Media；Heddleston, K.（2016）「入职与团队债务的代价」，kateheddleston.com",
    "complexity": "intermediate",
    "when_to_use": [
      "When the engineering ladder exists on paper but junior and mid-level engineers do not have a clear or supported path to the next level",
      "When senior engineers are leaving and taking institutional knowledge with them because there is no systematic knowledge transfer mechanism",
      "When diversity and inclusion initiatives are stalling because underrepresented engineers lack access to informal sponsorship and career guidance networks",
      "When the team has grown past 20 engineers and informal mentoring through proximity (adjacent desks, shared lunches) no longer reaches everyone"
    ],
    "when_to_use_zh": [
      "当工程晋升阶梯在纸面上存在，但初级和中级工程师没有清晰或有支撑的路径通向下一级时",
      "当高级工程师离职并带走机构知识，因为没有系统性知识传递机制时",
      "当多元化和包容性举措停滞不前，因为代表性不足的工程师缺乏获取非正式赞助和职业指导网络的渠道时",
      "当团队规模超过 20 名工程师，通过邻近性（相邻工位、共同午餐）进行的非正式指导不再覆盖所有人时"
    ],
    "core_concepts": [
      "Mentorship vs Sponsorship: Mentorship develops skills and perspective through advice and feedback; sponsorship actively advocates for the mentee in promotion discussions, high-visibility projects, and leadership conversations — both are needed for equitable career acceleration",
      "Deliberate Practice in Engineering: Mentorship is most effective when it targets specific, identified skill gaps (system design, code review quality, communication) rather than generic 'career advice' — the mentee grows faster when practice is intentional",
      "Mentor as Multiplier: The highest-leverage role for a senior engineer is developing junior engineers, not writing more code — a mentor who helps three engineers reach the next level has created more organizational value than any individual contribution",
      "Structured Cadence: Regular, calendar-blocked check-ins with a lightweight agenda prevent mentoring relationships from fading — the most common failure mode of informal mentorship is gradual drift into no contact",
      "Bidirectional Value: Well-designed mentorship programs create value for mentors too — they develop coaching and communication skills, gain exposure to fresh perspectives, and often report increased job satisfaction from contributing to others' growth"
    ],
    "core_concepts_zh": [
      "导师制与赞助的区别：导师制通过建议和反馈发展技能和视角；赞助制在晋升讨论、高曝光项目和领导层对话中主动为学员发声——两者都是实现公平职业加速所必需的",
      "工程中的刻意练习：当导师制针对特定的、已识别的技能差距（系统设计、代码审查质量、沟通）而非泛泛的「职业建议」时最为有效——当练习是有意识的时，学员成长更快",
      "导师作为倍增器：高级工程师最具杠杆效应的角色是培养初级工程师，而非编写更多代码——帮助三名工程师晋升到下一级的导师创造的组织价值超过任何个人贡献",
      "结构化节奏：定期、在日历上预留时间的例行沟通，配有轻量级议程，防止导师关系逐渐消亡——非正式导师制最常见的失败模式是逐渐漂移至失联",
      "双向价值：精心设计的导师制计划也为导师创造价值——他们发展辅导和沟通技能，获得接触新视角的机会，并且通常因为为他人成长做出贡献而报告更高的工作满意度"
    ],
    "timeline": [
      [
        "1990",
        "Formal mentoring programs gain traction in corporate engineering after studies show that informal mentoring disproportionately benefits majority-group employees, prompting structured alternatives"
      ],
      [
        "2017",
        "Camille Fournier's 'The Manager's Path' dedicates chapters to technical mentorship, establishing it as a first-class engineering leadership responsibility alongside architecture and hiring"
      ],
      [
        "2020",
        "Remote-first shift creates demand for asynchronous mentoring formats — video check-ins, shared docs, and Slack-based Q&A — making mentorship accessible regardless of timezone or office proximity"
      ],
      [
        "2024",
        "AI coding assistants begin changing the mentor-mentee dynamic: mentors increasingly focus on judgment, architecture decisions, and code review rather than syntax and API lookup, which AI handles fluently"
      ]
    ],
    "timeline_zh": [
      [
        "1990",
        "企业工程领域的正式导师计划获得牵引力，研究表明非正式导师制不成比例地惠及多数群体员工，促使结构化替代方案的出现"
      ],
      [
        "2017",
        "Camille Fournier 的「管理者之路」用多章节专门讨论技术导师制，将其确立为与架构和招聘并列的一流工程领导职责"
      ],
      [
        "2020",
        "远程优先的转变催生了对异步导师制形式的需求——视频沟通、共享文档和 Slack 答疑——使导师制不受时区或办公室邻近性限制"
      ],
      [
        "2024",
        "AI 编程助手开始改变导师-学员动态：导师越来越专注于判断力、架构决策和代码审查，而非语法和 API 查找（AI 能流畅处理这些）"
      ]
    ],
    "dos": [
      "Do formalize matching rather than leaving it entirely to self-selection — informal mentorship networks naturally replicate existing social hierarchies and disadvantage engineers who lack access to senior relationships",
      "Do give mentors training in structured feedback techniques before pairing them — untrained mentors often default to advice-giving monologues rather than coaching conversations that build the mentee's own judgment",
      "Do establish explicit learning goals at the start of each mentorship pairing so both parties have a shared understanding of what success looks like at the 90-day and 6-month marks",
      "Do protect mentor time explicitly in sprint planning — if mentoring competes with delivery commitments, it will always lose"
    ],
    "dos_zh": [
      "正式化配对流程，而非完全依赖自我选择——非正式导师网络自然会复制现有的社会层级，不利于缺乏高级关系渠道的工程师",
      "在配对之前为导师提供结构化反馈技术培训——未经培训的导师往往默认为给建议的独白，而非建立学员自身判断力的辅导对话",
      "在每对导师配对开始时建立明确的学习目标，使双方对 90 天和 6 个月里程碑的成功标准有共同理解",
      "在迭代计划中明确保护导师时间——如果导师制与交付承诺相竞争，它将永远处于下风"
    ],
    "donts": [
      "Don't run a mentorship program where participation is mandatory for senior engineers but unrewarded — voluntary participation without recognition leads to resentful, low-quality mentoring",
      "Don't conflate mentorship with line management — mixing advice and accountability creates a dynamic where the mentee cannot be honest about struggles without fearing performance consequences",
      "Don't leave mentorship program governance to HR alone — the program must be co-owned by engineering leadership to ensure technical relevance, adequate time allocation, and organizational prioritization",
      "Don't define program success by participation rate alone — a high enrollment rate with low goal achievement is worse than a smaller program where mentees demonstrably level up"
    ],
    "donts_zh": [
      "不要运行高级工程师参与是强制的但没有奖励的导师计划——没有认可的自愿参与会导致充满怨恨的低质量指导",
      "不要将导师制与直线管理混为一谈——将建议和问责混合会产生学员无法诚实说出困难而不担心绩效后果的动态",
      "不要将导师计划治理完全留给 HR——该计划必须由工程领导层共同负责，以确保技术相关性、充足的时间分配和组织优先级",
      "不要仅以参与率来定义计划成功——高注册率但低目标达成率比规模较小但学员明显得到提升的计划更糟糕"
    ],
    "case_study_company": "Pinterest",
    "case_study": "Pinterest's engineering mentorship program, documented in their engineering blog (2019-2021), paired mid-level engineers with staff/principal engineers for 6-month structured engagements focused on system design and architectural thinking. The program used a formal matching questionnaire covering technical interests, growth goals, and working style preferences. Mentors received a facilitation guide with session structure templates and feedback frameworks. After the pilot cohort, Pinterest measured that participants promoted to the next engineering level at a 35% higher rate within 12 months compared to a control group. Mentors reported increased satisfaction with their role and cited the program as a retention factor. The program was later extended to include reverse mentorship (senior engineers learning from junior engineers on new technologies and user empathy).",
    "case_study_zh": "Pinterest 的工程导师计划在其工程博客（2019-2021）中有所记录，将中级工程师与高级/首席工程师配对，进行 6 个月结构化参与，专注于系统设计和架构思维。该计划使用正式的配对问卷，涵盖技术兴趣、成长目标和工作风格偏好。导师收到包含会议结构模板和反馈框架的引导指南。在试点队列之后，Pinterest 测量到参与者在 12 个月内晋升到下一工程级别的比率比对照组高 35%。导师报告对其角色的满意度提高，并将该计划列为留任因素。该计划后来扩展到包括反向导师制（高级工程师向初级工程师学习新技术和用户同理心）。",
    "when_not_to_use": [
      "Startups under 15 engineers where close daily collaboration provides natural mentoring and adding formal program overhead is premature",
      "Teams where senior engineers are already overwhelmed with delivery commitments and adding mentoring responsibility would cause burnout rather than growth",
      "Organizations without leadership commitment to protect mentor time — programs without time protection reliably fail within two quarters",
      "Short-duration teams (project-based, under 6 months) where the relationship investment horizon is too short for structured mentorship to yield returns"
    ],
    "when_not_to_use_zh": [
      "少于 15 名工程师的创业公司，紧密的日常协作提供了自然的指导，增加正式计划开销尚为时过早",
      "高级工程师已经被交付承诺压垮的团队，增加指导责任会导致倦怠而非成长",
      "没有领导层承诺保护导师时间的组织——没有时间保护的计划在两个季度内可靠地失败",
      "短期团队（基于项目，少于 6 个月），关系投资期太短，结构化导师制无法产生回报"
    ],
    "adopters": [
      "Pinterest",
      "Google",
      "Microsoft",
      "Airbnb",
      "Lyft"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Fournier, C. (2017). \"The Manager's Path: A Guide for Tech Leaders Navigating Growth and Change\". O'Reilly Media.",
    "secondary_sources": [
      "Heddleston, K. (2016). \"Onboarding and the Cost of Team Debt\". kateheddleston.com.",
      "Murphy, C. & Knutas, A. (2019). \"Engineering Mentorship Programs: Best Practices\". IEEE Software.",
      "Larson, W. (2019). \"An Elegant Puzzle: Systems of Engineering Management\". Stripe Press."
    ],
    "typed_relations": [
      {
        "slug": "engineering-ladder",
        "type": "complement"
      },
      {
        "slug": "guilds-communities-of-practice",
        "type": "complement"
      },
      {
        "slug": "developer-onboarding-framework",
        "type": "related"
      }
    ]
  },
  {
    "id": 283,
    "name": "Architecture Guild",
    "name_zh": "架构公会",
    "slug": "architecture-guild",
    "category": "team",
    "desc": "Cross-team community of practice for architecture alignment, decision review, and technical standard-setting without centralized command",
    "desc_zh": "跨团队架构对齐、决策评审和技术标准制定的实践社区，无需集中式指挥",
    "steps": [
      "Establish the guild charter: define scope (what architectural decisions require guild involvement), membership criteria (senior engineers and architects from each domain), meeting cadence (bi-weekly), and decision authority (advisory vs binding by decision type)",
      "Distinguish guild scope from the Architecture Review Board — guilds are voluntary communities of practice focused on ongoing alignment and knowledge sharing; an ARB is a formal governance gate; clearly defining the boundary prevents duplication and confusion",
      "Create a lightweight architectural decision intake process: teams propose cross-cutting decisions via a short RFC or ADR draft, the guild reviews asynchronously first, then discusses in the bi-weekly session — avoiding the bottleneck of routing every decision through a committee",
      "Maintain a shared architectural principles document (5-10 high-level principles that guide decisions, not prescribe solutions) that the guild owns and updates annually — making implicit team norms explicit and debatable",
      "Run periodic architecture landscape sessions where teams present their current system maps and upcoming decisions — building cross-team visibility, surfacing conflicts early, and creating learning opportunities for less-experienced engineers to observe how architectural thinking works"
    ],
    "steps_zh": [
      "建立公会章程：定义范围（哪些架构决策需要公会参与）、成员资格标准（每个领域的高级工程师和架构师）、会议节奏（每两周一次）以及决策权限（按决策类型区分顾问性或约束性）",
      "区分公会范围与架构评审委员会——公会是专注于持续对齐和知识共享的自愿实践社区；ARB 是正式的治理门禁；清晰定义边界防止重复和混乱",
      "创建轻量级架构决策接收流程：团队通过简短的 RFC 或 ADR 草稿提出跨切面决策，公会先异步审查，然后在双周会议中讨论——避免将每个决策路由通过委员会的瓶颈",
      "维护共享架构原则文档（5-10 条指导决策的高层原则，而非规定解决方案），公会负责并每年更新——使隐性团队规范显性化和可讨论",
      "定期举办架构全景会议，各团队展示其当前系统地图和即将到来的决策——建立跨团队可见性，及早发现冲突，并为经验较少的工程师创造观察架构思维如何运作的学习机会"
    ],
    "ai_relevant": false,
    "viz_type": "venn",
    "viz_labels": [
      "Guild",
      "Architecture Review Board",
      "ADR / RFC"
    ],
    "viz_labels_zh": [
      "技术公会",
      "架构评审委员会",
      "决策记录"
    ],
    "related": [
      "architecture-review-board",
      "guilds-communities-of-practice",
      "adr",
      "rfc-process"
    ],
    "tags": [
      "architecture",
      "guild",
      "community-of-practice",
      "alignment",
      "cross-team"
    ],
    "origin_author": "Spotify",
    "origin_source": "Kniberg, H. & Ivarsson, A. (2012). \"Scaling Agile @ Spotify\". Spotify Labs whitepaper; Skelton, M. & Pais, M. (2019). \"Team Topologies\". IT Revolution Press.",
    "origin_source_zh": "Kniberg, H. & Ivarsson, A.（2012）「Spotify 的规模化敏捷」，Spotify Labs 白皮书；Skelton, M. & Pais, M.（2019）「团队拓扑」，IT Revolution Press",
    "complexity": "intermediate",
    "when_to_use": [
      "When architectural decisions are being made independently by multiple teams without cross-team visibility, leading to duplication, divergence, or incompatible patterns",
      "When a formal Architecture Review Board exists but is perceived as a slow bureaucratic gate — a guild provides the alignment without the bottleneck",
      "When senior engineers with architectural expertise are distributed across teams and have no forum for cross-team learning and standard-setting",
      "When the organization is transitioning from monolith to microservices and needs a coordination mechanism for emerging distributed system patterns"
    ],
    "when_to_use_zh": [
      "当多个团队在没有跨团队可见性的情况下独立做出架构决策，导致重复、分歧或不兼容模式时",
      "当正式的架构评审委员会存在但被认为是缓慢的官僚门禁时——公会提供对齐而不引入瓶颈",
      "当具有架构专业知识的高级工程师分布在各团队，没有跨团队学习和标准制定的论坛时",
      "当组织正在从单体架构过渡到微服务，需要新兴分布式系统模式的协调机制时"
    ],
    "core_concepts": [
      "Community of Practice (CoP): A voluntary, cross-functional group united by shared craft interest rather than organizational reporting lines — guilds derive their authority from expertise and trust, not hierarchy",
      "Advisory vs Binding Authority: Architecture guilds work best when they have advisory authority for most decisions (recommendations teams can override with explicit justification) and binding authority only for foundational cross-cutting concerns (security standards, data contracts, API versioning policies)",
      "Architectural Drift: The gradual divergence of team-level architectural decisions from agreed patterns when there is no alignment forum — guilds prevent drift by creating a regular venue for comparing and reconciling approaches",
      "Enabling Constraint: Architectural principles set by a guild function as enabling constraints — they reduce the decision space to prevent locally-optimal but globally-incoherent choices, while still leaving teams freedom to solve problems creatively within the boundaries",
      "Distributed Architecture Ownership: The alternative to centralized architecture control — authority is distributed to teams, but coherence is maintained through shared principles, voluntary coordination, and mutual review rather than top-down mandates"
    ],
    "core_concepts_zh": [
      "实践社区（CoP）：由共同工艺兴趣而非组织汇报关系联合的自愿跨职能团体——公会从专业知识和信任而非层级获得权威",
      "顾问性与约束性权力：架构公会在对大多数决策具有顾问性权力时效果最佳（团队可以用明确理由推翻的建议），仅对基础性跨切面关注点（安全标准、数据合约、API 版本策略）具有约束性权力",
      "架构漂移：当没有对齐论坛时，团队层级架构决策与约定模式的逐渐偏离——公会通过创建定期比较和协调方法的场所来防止漂移",
      "赋能约束：公会设定的架构原则作为赋能约束发挥作用——它们缩小决策空间以防止局部最优但全局不连贯的选择，同时仍留给团队在边界内创造性解决问题的自由",
      "分布式架构所有权：集中式架构控制的替代方案——权力分配给团队，但通过共享原则、自愿协调和相互审查而非自上而下的指令来维持一致性"
    ],
    "timeline": [
      [
        "2012",
        "Spotify's internal engineering model, documented by Henrik Kniberg and Anders Ivarsson, introduces the guild as a formal organizational construct alongside squads, tribes, and chapters"
      ],
      [
        "2019",
        "Team Topologies (Skelton & Pais) formalizes the enabling team pattern, providing a systems-thinking lens for architecture guilds as cross-cutting enabling structures"
      ],
      [
        "2020",
        "Remote-first shift forces guilds to adapt to async-first formats — recorded architecture talks, async ADR reviews in GitHub, and virtual architecture landscape sessions become standard"
      ],
      [
        "2023",
        "Architecture guilds increasingly adopt AI-assisted decision support tools for searching prior ADRs, detecting pattern conflicts, and surfacing related decisions across the organization"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Henrik Kniberg 和 Anders Ivarsson 记录的 Spotify 内部工程模型将公会作为正式组织构建引入，与小队、部落和分会并列"
      ],
      [
        "2019",
        "「团队拓扑」（Skelton & Pais）将赋能团队模式正式化，为架构公会作为跨切面赋能结构提供系统思维视角"
      ],
      [
        "2020",
        "远程优先转变迫使公会适应异步优先格式——录制架构演讲、GitHub 中的异步 ADR 审查和虚拟架构全景会议成为标准"
      ],
      [
        "2023",
        "架构公会越来越多地采用 AI 辅助决策支持工具，用于搜索先前的 ADR、检测模式冲突，以及在组织内浮现相关决策"
      ]
    ],
    "dos": [
      "Do give the guild a clear charter that specifies which decisions require guild involvement and which are fully within team authority — ambiguity leads to either bypassing the guild or creating unnecessary bottlenecks",
      "Do keep guild membership permeable — any engineer with relevant expertise should be able to participate in a review, even if they are not a standing member, because this builds organizational alignment beyond the core group",
      "Do publish guild decisions and architectural principles openly (internal wiki, ADR repository) so teams can self-serve alignment without queuing up for the next guild meeting",
      "Do rotate the guild chair or facilitator role regularly to prevent it from becoming a de facto architecture authority concentrated in one person"
    ],
    "dos_zh": [
      "给公会一个明确的章程，规定哪些决策需要公会参与，哪些完全在团队权力范围内——模糊性会导致绕过公会或造成不必要的瓶颈",
      "保持公会成员资格的渗透性——任何具有相关专业知识的工程师都应该能够参与审查，即使他们不是常设成员，因为这能在核心团队之外建立组织对齐",
      "公开发布公会决策和架构原则（内部 wiki、ADR 存储库），使团队可以自助对齐而无需排队等待下一次公会会议",
      "定期轮换公会主席或主持人角色，防止其成为集中在一个人身上的事实架构权力"
    ],
    "donts": [
      "Don't create a guild that duplicates the Architecture Review Board — if both exist, define clearly which handles governance (ARB) and which handles learning and alignment (guild)",
      "Don't make guild attendance mandatory for engineers without adjusting their sprint capacity — treating the guild as just another meeting without protecting time for it signals that architectural alignment is not a real priority",
      "Don't let the guild become an approval bottleneck by requiring guild sign-off for every architectural decision — apply guild review proportionally to decision scope and reversibility",
      "Don't allow the guild to become a talking shop without outputs — every session should produce at least one artifact: an updated ADR, a revised principle, a shared decision log entry, or a recorded architecture talk"
    ],
    "donts_zh": [
      "不要创建一个与架构评审委员会重复的公会——如果两者都存在，清晰定义哪个处理治理（ARB）哪个处理学习和对齐（公会）",
      "不要在不调整工程师迭代容量的情况下强制要求参加公会——将公会视为另一个会议而不保护时间意味着架构对齐不是真正的优先事项",
      "不要通过要求所有架构决策都获得公会批准让公会成为审批瓶颈——按比例将公会审查应用于决策范围和可逆性",
      "不要让公会成为没有产出的清谈馆——每次会议都应产生至少一个产物：更新的 ADR、修订的原则、共享决策日志条目或录制的架构演讲"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify's architecture guild, part of the squad/tribe/chapter/guild model documented in 2012, became one of the most widely referenced examples of distributed architecture governance. The guild operated as a voluntary community of senior engineers and architects across all tribes, meeting bi-weekly to review RFC proposals, share architectural learnings, and maintain Spotify's internal technology radar. Rather than approving or rejecting decisions, the guild's primary artifact was a living architectural principles document and a technology radar that tracked adopted, experimental, and deprecated patterns. Teams could make independent decisions but were expected to contribute learnings back to the guild. This model enabled Spotify to maintain coherent architecture across 100+ autonomous squads without a centralized architecture function, a balance that became a case study in the Team Topologies community.",
    "case_study_zh": "Spotify 的架构公会是 2012 年记录的小队/部落/分会/公会模型的一部分，成为分布式架构治理最广泛引用的案例之一。公会作为所有部落的高级工程师和架构师的自愿社区运作，每两周举行一次会议，审查 RFC 提案、分享架构经验，并维护 Spotify 内部技术雷达。公会不审批或拒绝决策，其主要产物是活跃的架构原则文档和跟踪已采纳、实验性和已废弃模式的技术雷达。团队可以独立做出决策，但被期望将经验反馈给公会。这一模型使 Spotify 能够在没有集中式架构职能的情况下在 100 多个自主小队之间维持连贯的架构，这一平衡成为团队拓扑社区的案例研究。",
    "when_not_to_use": [
      "Engineering organizations under 30 engineers where a single weekly architecture discussion in the all-hands is sufficient and a separate guild adds overhead",
      "Highly regulated industries where architecture decisions require formal traceability and approval — an ARB provides the accountability that a voluntary guild cannot",
      "Organizations where engineers lack psychological safety to challenge decisions in a cross-team forum — a guild without trust becomes a venue for political maneuvering rather than technical alignment",
      "Companies where the architectural approach is deliberately monolithic and all engineers work in the same codebase — guild coordination value only materializes in distributed or multi-team architectures"
    ],
    "when_not_to_use_zh": [
      "少于 30 名工程师的工程组织，全体会议中每周一次架构讨论已足够，单独的公会增加了开销",
      "架构决策需要正式可追溯性和审批的高度监管行业——ARB 提供了自愿公会无法保证的问责性",
      "工程师缺乏心理安全感在跨团队论坛中挑战决策的组织——没有信任的公会成为政治操纵而非技术对齐的场所",
      "刻意采用单体架构且所有工程师在同一代码库工作的公司——公会协调价值只在分布式或多团队架构中体现"
    ],
    "adopters": [
      "Spotify",
      "ING Bank",
      "Zalando",
      "ASOS",
      "Klarna"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "Kniberg, H. & Ivarsson, A. (2012). \"Scaling Agile @ Spotify\". Spotify Labs whitepaper. blog.crisp.se.",
    "secondary_sources": [
      "Skelton, M. & Pais, M. (2019). \"Team Topologies: Organizing Business and Technology Teams for Fast Flow\". IT Revolution Press.",
      "Hewitt, E. (2018). \"Technology Strategy Patterns\". O'Reilly Media.",
      "Ford, N. et al. (2022). \"Software Architecture: The Hard Parts\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "architecture-review-board",
        "type": "related"
      },
      {
        "slug": "guilds-communities-of-practice",
        "type": "complement"
      },
      {
        "slug": "adr",
        "type": "complement"
      }
    ]
  },
  {
    "id": 284,
    "name": "Decision Log Practice",
    "name_zh": "决策日志实践",
    "slug": "decision-log-practice",
    "category": "team",
    "desc": "Systematically recording, storing, and reviewing team decisions to create organizational memory, reduce repeated debates, and enable async alignment",
    "desc_zh": "系统性地记录、存储和回顾团队决策，创建组织记忆，减少重复争论，实现异步对齐",
    "steps": [
      "Choose a decision record format proportional to decision scope: lightweight Decision Log (title, context, decision, date, owner) for operational choices; full ADR (Architecture Decision Record) format for technical choices with significant downstream impact; formal RFC for cross-team proposals requiring broad input",
      "Establish a single canonical location for all decision records (a Git repository, a Notion database, or a Confluence space) with a consistent naming convention and indexed by date, team, and topic — accessibility and searchability are the primary value drivers",
      "Define a lightweight decision intake trigger: decisions that affect more than one team, decisions that will be hard to reverse, and decisions that establish a pattern others will follow are candidates for formal recording — not every choice needs a log entry",
      "Build decision review into the team rhythm: a quarterly 'decision archaeology' session reviews past decisions, identifies which assumptions have been invalidated, and flags records that should be updated, superseded, or deprecated",
      "Link decisions to downstream artifacts: ADRs linked in code comments near affected areas, RFCs linked in Jira epics, and decision records linked in post-incident reviews — creating a traceable chain from context to implementation"
    ],
    "steps_zh": [
      "选择与决策范围相称的记录格式：对操作性选择使用轻量级决策日志（标题、背景、决策、日期、负责人）；对具有重大下游影响的技术选择使用完整的 ADR（架构决策记录）格式；对需要广泛输入的跨团队提案使用正式 RFC",
      "为所有决策记录建立单一规范位置（Git 存储库、Notion 数据库或 Confluence 空间），采用一致的命名约定，按日期、团队和主题建立索引——可访问性和可搜索性是主要价值驱动因素",
      "定义轻量级决策接收触发器：影响多个团队的决策、难以撤销的决策，以及建立他人将遵循模式的决策是正式记录的候选项——不是每个选择都需要日志条目",
      "将决策审查纳入团队节奏：每季度「决策考古」会议回顾过去决策，识别哪些假设已被证伪，并标记应更新、取代或废弃的记录",
      "将决策与下游产物链接：ADR 链接在受影响区域附近的代码注释中，RFC 链接在 Jira 史诗中，决策记录链接在事后审查中——创建从背景到实现的可追溯链条"
    ],
    "ai_relevant": true,
    "viz_type": "timeline",
    "viz_labels": [
      "Decision Format",
      "Canonical Location",
      "Intake Trigger",
      "Quarterly Review",
      "Link Artifacts"
    ],
    "viz_labels_zh": [
      "记录格式",
      "统一位置",
      "触发条件",
      "季度回顾",
      "关联制品"
    ],
    "related": [
      "adr",
      "rfc-process",
      "architecture-guild",
      "blameless-postmortems"
    ],
    "tags": [
      "decision-records",
      "adr",
      "organizational-memory",
      "async-collaboration",
      "documentation"
    ],
    "origin_author": "Michael Nygard",
    "origin_source": "Nygard, M. (2011). \"Documenting Architecture Decisions\". cognitect.com/blog; Tyree, J. & Akerman, A. (2005). \"Architecture Decisions: Demystifying Architecture\". IEEE Software.",
    "origin_source_zh": "Nygard, M.（2011）「记录架构决策」，cognitect.com/blog；Tyree, J. & Akerman, A.（2005）「架构决策：揭开架构的神秘面纱」，IEEE Software",
    "complexity": "beginner",
    "when_to_use": [
      "When the team repeatedly reopens the same debates because there is no record of why a previous decision was made and what alternatives were considered",
      "When onboarding new engineers takes longer than expected because system design rationale is undocumented and must be extracted through interviews with long-tenured team members",
      "When post-incident reviews frequently surface 'we don't know why this was built this way' as a contributing factor",
      "When the team is distributed across timezones and asynchronous context-sharing is essential for effective collaboration"
    ],
    "when_to_use_zh": [
      "当团队反复重新开启相同的争论，因为没有关于以前决策原因及考虑过哪些替代方案的记录时",
      "当新工程师入职需要比预期更长的时间，因为系统设计理由未记录，必须通过与长期团队成员的访谈来提取时",
      "当事后审查频繁将「我们不知道为什么这样构建」列为原因之一时",
      "当团队跨时区分布，异步上下文共享对于有效协作至关重要时"
    ],
    "core_concepts": [
      "Architecture Decision Record (ADR): A short, structured document capturing a single architectural decision — the context that drove it, the decision itself, the alternatives considered, and the consequences expected — stored in the repository alongside the code it affects",
      "Organizational Memory: The accumulated institutional knowledge of why systems are built the way they are; without decision records, this knowledge lives only in the heads of long-tenured engineers and is lost when they leave",
      "Decision Traceability: The ability to trace from a current code or configuration artifact back to the decision that created it, the context that justified it, and the alternatives that were rejected — critical for change impact analysis and incident investigation",
      "Decision Hygiene: The practice of keeping the decision record current — marking decisions as superseded when circumstances change, adding outcome notes when initial assumptions are validated or refuted, and archiving rather than deleting deprecated decisions",
      "Reversibility Classification: Classifying decisions on a spectrum from fully reversible (cheap to change) to near-irreversible (prohibitively expensive to reverse) — high-reversibility decisions need less formal documentation; low-reversibility decisions warrant thorough ADRs"
    ],
    "core_concepts_zh": [
      "架构决策记录（ADR）：捕获单个架构决策的简短结构化文档——驱动它的背景、决策本身、考虑过的替代方案以及预期后果——与其影响的代码一起存储在存储库中",
      "组织记忆：关于系统为何以现有方式构建的积累性机构知识；没有决策记录，这些知识只存在于长期工程师的脑海中，当他们离开时就会丢失",
      "决策可追溯性：从当前代码或配置产物追溯到创建它的决策、证明它的背景以及被拒绝的替代方案的能力——对于变更影响分析和事件调查至关重要",
      "决策卫生：保持决策记录最新的实践——当情况变化时将决策标记为已取代，当初始假设被验证或反驳时添加结果说明，以及归档而非删除已废弃决策",
      "可逆性分类：将决策分类到从完全可逆（变更成本低）到近乎不可逆（撤销代价极高）的谱系上——高可逆性决策需要较少的正式文档；低可逆性决策值得详尽的 ADR"
    ],
    "timeline": [
      [
        "2005",
        "Tyree & Akerman publish 'Architecture Decisions: Demystifying Architecture' in IEEE Software, establishing the first structured framework for documenting architectural choices"
      ],
      [
        "2011",
        "Michael Nygard publishes 'Documenting Architecture Decisions' on the Cognitect blog, introducing the ADR format that becomes the de facto standard for lightweight decision capture"
      ],
      [
        "2016",
        "adr-tools command-line utility released by Nat Pryce, making ADR creation a one-command operation in Git repositories and driving widespread adoption in DevOps-oriented teams"
      ],
      [
        "2022",
        "MADR (Markdown Architectural Decision Records) and Y-Statements formats gain traction as simpler alternatives to full ADR, lowering the friction for teams new to decision logging"
      ]
    ],
    "timeline_zh": [
      [
        "2005",
        "Tyree & Akerman 在 IEEE Software 上发表「架构决策：揭开架构的神秘面纱」，建立了首个记录架构选择的结构化框架"
      ],
      [
        "2011",
        "Michael Nygard 在 Cognitect 博客上发表「记录架构决策」，引入了成为轻量级决策捕获事实标准的 ADR 格式"
      ],
      [
        "2016",
        "Nat Pryce 发布 adr-tools 命令行工具，使 ADR 创建成为 Git 存储库中的单命令操作，推动了面向 DevOps 团队的广泛采用"
      ],
      [
        "2022",
        "MADR（Markdown 架构决策记录）和 Y-Statements 格式作为完整 ADR 的更简单替代方案获得牵引力，降低了决策日志记录新团队的摩擦"
      ]
    ],
    "dos": [
      "Do store ADRs in the same Git repository as the code they relate to because co-location creates discoverability through search, enables PR-linked review, and ensures decisions are versioned alongside the artifacts they govern",
      "Do write decision records at the time of the decision, not retrospectively — retrospective ADRs are often incomplete because the full context is lost, and they are frequently never written at all",
      "Do include the alternatives considered and why they were rejected in every ADR because the rejected options are often as instructive as the chosen one — future engineers need to understand why option B was not selected before re-proposing it",
      "Do define explicit triggers for when a new decision record is required — the worst decision logging culture is one where some teams write ADRs for everything and others write none, resulting in inconsistent organizational memory"
    ],
    "dos_zh": [
      "将 ADR 存储在与其相关代码相同的 Git 存储库中，因为共同存储通过搜索创造可发现性，支持 PR 链接审查，并确保决策与其管理的产物一起版本化",
      "在决策时而非事后回溯性地编写决策记录——回溯性 ADR 通常不完整，因为完整背景已丢失，而且它们往往根本不会被写出",
      "在每个 ADR 中包含考虑过的替代方案及其被拒绝的原因，因为被拒绝的选项通常与被选择的选项一样具有启发性——未来工程师在重新提出方案 B 之前需要了解它为何未被选择",
      "定义何时需要新决策记录的明确触发器——最糟糕的决策日志文化是某些团队为所有事情编写 ADR，而其他团队一个也不写，导致组织记忆不一致"
    ],
    "donts": [
      "Don't write ADRs that describe what was decided without explaining why — the 'why' and 'alternatives considered' sections are the primary value of the document; a decision without context is just a policy statement",
      "Don't require lengthy ADRs for operational decisions — a 5-paragraph ADR for a variable naming convention or tool configuration choice adds friction without value; reserve the format for genuinely consequential decisions",
      "Don't let decision records become an approval process — ADRs are documentation artifacts, not sign-off gates; coupling decision logging to formal approval slows decision-making and discourages teams from logging smaller decisions",
      "Don't neglect decision hygiene — a repository of outdated, superseded ADRs that are never marked as deprecated is worse than no decision log because it actively misleads engineers about the current state of the system"
    ],
    "donts_zh": [
      "不要编写只描述决策内容而不解释原因的 ADR——「原因」和「考虑过的替代方案」部分是文档的主要价值；没有背景的决策只是政策声明",
      "不要为操作性决策要求冗长的 ADR——为变量命名约定或工具配置选择编写 5 段 ADR 会增加摩擦而无价值；将格式保留给真正有影响力的决策",
      "不要让决策记录成为审批流程——ADR 是文档产物，不是签字门禁；将决策记录与正式审批耦合会减慢决策速度，并阻止团队记录较小的决策",
      "不要忽视决策卫生——一个充满过时、已取代但从未标记为废弃的 ADR 的存储库比没有决策日志更糟糕，因为它会主动误导工程师了解系统当前状态"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify's engineering organization adopted ADR-based decision logging as part of their architecture guild practice. Each squad was expected to log architectural decisions in a shared GitHub repository using a lightweight ADR template that captured context, decision, status, and consequences. The guild maintained a searchable index of all ADRs across squads, enabling cross-team pattern recognition — when multiple squads independently converged on similar database caching approaches, the guild surfaced the pattern, created a shared ADR, and established it as a recommended practice. During a major platform migration (from monolith to microservices, 2014-2016), decision logs were credited in retrospectives as a key factor in the migration's coherence — engineers joining midway could reconstruct the reasoning behind design choices without interviewing the original authors.",
    "case_study_zh": "Spotify 的工程组织将基于 ADR 的决策记录作为其架构公会实践的一部分采用。每个小队都被期望使用轻量级 ADR 模板在共享 GitHub 存储库中记录架构决策，捕获背景、决策、状态和后果。公会维护跨小队所有 ADR 的可搜索索引，实现跨团队模式识别——当多个小队独立汇聚到相似的数据库缓存方案时，公会浮现了这一模式，创建了共享 ADR，并将其确立为推荐实践。在一次重大平台迁移（从单体架构到微服务，2014-2016）期间，决策日志在回顾中被誉为迁移连贯性的关键因素——中途加入的工程师可以在不采访原始作者的情况下重建设计选择背后的推理。",
    "when_not_to_use": [
      "Startups in the first 6 months where the system is small enough to be understood by everyone on the team without written records",
      "Research or prototype projects where decisions are intentionally provisional and the system will be rebuilt or discarded — spending time on decision records for throwaway code is waste",
      "Teams where decision-making is so centralized that only one person makes architectural decisions — decision logging adds value when it distributes context to a team, not when it documents a single person's choices",
      "Organizations where the primary bottleneck is decision velocity, not decision quality — adding logging requirements to already-slow decision processes will worsen the bottleneck before improving it"
    ],
    "when_not_to_use_zh": [
      "前 6 个月的创业公司，系统足够小，团队中的每个人都无需书面记录就能理解",
      "决策有意是临时性的且系统将被重建或丢弃的研究或原型项目——为一次性代码花时间编写决策记录是浪费",
      "决策制定如此集中以至于只有一个人做出架构决策的团队——当决策记录将背景分发给团队时才有价值，而非只是记录一个人的选择",
      "主要瓶颈是决策速度而非决策质量的组织——在已经缓慢的决策流程中增加记录要求会在改善之前加剧瓶颈"
    ],
    "adopters": [
      "Spotify",
      "Zalando",
      "GitHub",
      "ThoughtWorks",
      "Netflix"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Nygard, M. (2011). \"Documenting Architecture Decisions\". cognitect.com/blog.",
    "secondary_sources": [
      "Tyree, J. & Akerman, A. (2005). \"Architecture Decisions: Demystifying Architecture\". IEEE Software, Vol. 22, No. 2.",
      "Pryce, N. (2016). \"adr-tools\". github.com/npryce/adr-tools.",
      "Hohpe, G. (2021). \"The Software Architect Elevator\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "adr",
        "type": "complement"
      },
      {
        "slug": "rfc-process",
        "type": "complement"
      },
      {
        "slug": "architecture-review-board",
        "type": "related"
      }
    ]
  },
  {
    "id": 312,
    "name": "Engineering Principles Framework",
    "name_zh": "工程原则框架",
    "slug": "engineering-principles-framework",
    "category": "team",
    "desc": "A structured practice of defining, publishing, and evolving explicit engineering principles that guide daily technical decisions across an organization",
    "desc_zh": "定义、发布和演进明确工程原则的结构化实践，引导组织内日常技术决策",
    "steps": [
      "Gather input across seniority levels: run structured workshops where engineers at all levels articulate the implicit heuristics they already apply when making technical choices — capture these as candidate principles in raw form",
      "Synthesize and deduplicate: group similar heuristics, identify tensions between them, and draft 8-15 concise principles with a name, one-sentence statement, rationale paragraph, and two example applications each",
      "Ratify through deliberation: circulate the draft to all engineering teams for a 2-week review, resolve conflicts through async RFC discussion, then publish the v1 doc in a prominent location (engineering handbook, internal site)",
      "Embed principles in decision artifacts: require ADRs, tech specs, and code review comments to cite relevant principles by name, creating a living record of how principles shape real decisions",
      "Review and evolve annually: schedule a yearly retrospective to assess whether each principle is helping or hindering; retire stale ones, add new ones that capture emerging consensus, and publish a changelog"
    ],
    "steps_zh": [
      "跨级别收集意见：开展结构化工作坊，让各级工程师阐述他们在做技术选择时已经应用的隐性启发法——以原始形式收集这些内容作为候选原则",
      "整合与去重：对相似启发法进行分组，识别它们之间的张力，起草8-15条简洁原则，每条包含名称、单句陈述、理由段落和两个应用示例",
      "通过审议批准：将草稿发给所有工程团队进行2周审查，通过异步RFC讨论解决冲突，然后在显著位置（工程手册、内部网站）发布v1文档",
      "在决策产物中嵌入原则：要求ADR、技术规范和代码审查评论按名称引用相关原则，创建原则如何影响实际决策的活体记录",
      "每年回顾和演进：安排年度回顾评估每条原则是否有助或有碍；淘汰过时的，添加体现新兴共识的新原则，并发布变更日志"
    ],
    "ai_relevant": false,
    "viz_type": "pyramid",
    "viz_labels": [
      "Gather Input",
      "Synthesize",
      "Ratify",
      "Embed in Decisions",
      "Annual Review"
    ],
    "viz_labels_zh": [
      "收集输入",
      "综合提炼",
      "批准发布",
      "决策嵌入",
      "年度评审"
    ],
    "related": [
      "adr",
      "rfc-process",
      "engineering-ladder",
      "decision-log-practice",
      "architecture-review-board"
    ],
    "tags": [
      "engineering-culture",
      "principles",
      "decision-making",
      "team",
      "standards"
    ],
    "origin_author": "Various; formalized by Stripe, Netflix, and Basecamp engineering blogs (2013-2018)",
    "origin_year": 2013,
    "origin_source": "Stripe Engineering (2013). \"How we think about software development\". stripe.com/blog. Netflix Technology Blog (2016). \"Netflix Engineering Values\".",
    "origin_source_zh": "Stripe工程（2013）。《我们如何思考软件开发》。stripe.com/blog。Netflix技术博客（2016）。《Netflix工程价值观》。",
    "complexity": "beginner",
    "abstraction_level": "organization",
    "maturity_ring": "foundational",
    "quality_concerns": [
      "maintainability"
    ],
    "adopters": [
      "Stripe",
      "Netflix",
      "Basecamp",
      "Spotify",
      "Monzo"
    ],
    "when_to_use": [
      "Engineering organizations scaling past 50 engineers where informal cultural transmission of technical values becomes unreliable",
      "Post-merger or multi-office teams that need to establish shared technical norms across diverse engineering cultures",
      "Organizations experiencing recurring architectural inconsistencies or technology sprawl that indicate absence of agreed decision criteria",
      "Teams introducing engineering managers from non-technical backgrounds who need explicit codified guidance on what good engineering looks like"
    ],
    "when_to_use_zh": [
      "工程组织规模超过50人后，非正式的技术价值文化传播变得不可靠",
      "需要跨不同工程文化建立共同技术规范的合并后或多地团队",
      "经历反复出现的架构不一致或技术扩散的组织，表明缺乏商定的决策标准",
      "引入非技术背景工程经理的团队，需要明确的编码指导来说明好的工程是什么样的"
    ],
    "core_concepts": [
      "Principle vs. Rule: Principles are durable heuristics that guide judgment in novel situations; rules are binary mandates. Good engineering principles leave room for context-sensitive application",
      "Explicit Over Implicit: Writing down the values that experienced engineers already hold makes them transferable, debatable, and evolvable — rather than locked in the heads of founding engineers",
      "Rationale as First-Class Content: Every principle must carry its 'why' — the failure mode it prevents or the value it promotes — so engineers can apply it correctly in unforeseen situations",
      "Principle Citation in Artifacts: Engineering artifacts (ADRs, tech specs, PRs) that reference principles by name create an auditable trail of how stated values drive actual decisions",
      "Living Document: Principles drift from reality if not revisited; an annual cadence with a public changelog signals that the organization takes the document seriously rather than treating it as shelfware"
    ],
    "core_concepts_zh": [
      "原则与规则的区别：原则是指导在新情境中做判断的持久启发法；规则是二元指令。好的工程原则为情境敏感的应用留有空间",
      "显式优于隐式：写下有经验的工程师已经持有的价值观使其可传递、可辩论和可演进——而不是锁定在创始工程师的头脑中",
      "理由作为一等内容：每条原则必须承载其「为什么」——它防止的失败模式或它促进的价值——以便工程师在未预见的情况下正确应用它",
      "在产物中引用原则：按名称引用原则的工程产物（ADR、技术规范、PR）创建了说明价值观如何驱动实际决策的可审计追踪",
      "活体文档：如果不定期重新审视，原则会与现实偏离；带有公开变更日志的年度周期表明组织认真对待该文档而非将其束之高阁"
    ],
    "timeline": [
      [
        2013,
        "Stripe publishes its first internal engineering principles doc; other fast-growing startups follow the pattern"
      ],
      [
        2016,
        "Netflix engineering blog formalizes 'Freedom and Responsibility' as codified engineering values; widely referenced"
      ],
      [
        2019,
        "Monzo publishes its engineering principles openly; the open-sourcing of principles docs becomes a recruiting signal"
      ],
      [
        2022,
        "Platform engineering movement drives renewed interest in explicit principles as governance mechanism for internal platforms"
      ]
    ],
    "timeline_zh": [
      [
        2013,
        "Stripe发布其第一份内部工程原则文档；其他快速增长的初创公司跟随这一模式"
      ],
      [
        2016,
        "Netflix工程博客将「自由与责任」正式化为编码的工程价值观；被广泛参考"
      ],
      [
        2019,
        "Monzo公开发布其工程原则；开源原则文档成为招聘信号"
      ],
      [
        2022,
        "平台工程运动推动了对明确原则作为内部平台治理机制的新兴趣"
      ]
    ],
    "dos": [
      "Do involve engineers at all levels in principle authorship — principles authored exclusively by VPs will be ignored by ICs who had no voice in shaping them",
      "Do keep the list short (under 15 principles) so the full set is memorizable and each principle carries weight",
      "Do include concrete examples for each principle that show both correct and incorrect application in realistic scenarios",
      "Do track which principles are most frequently cited in ADRs as a leading indicator of which ones are actually influencing decisions"
    ],
    "dos_zh": [
      "让各级工程师参与原则的撰写——只由VP撰写的原则会被没有发言权的IC忽视",
      "保持列表简短（不超过15条原则），使全套原则可记忆，且每条原则具有分量",
      "为每条原则包含具体示例，展示在现实场景中正确和错误的应用",
      "追踪哪些原则在ADR中被最频繁引用，作为哪些原则实际影响决策的领先指标"
    ],
    "donts": [
      "Don't write principles as aspirational slogans without rationale — \"Move fast\" or \"Be excellent\" gives engineers no guidance when two values conflict",
      "Don't treat principles as immutable — an organization that never updates its principles likely stopped believing in them long ago",
      "Don't create principles by committee vote — authority-by-consensus produces anodyne statements that offend no one and guide no one",
      "Don't confuse principles with coding standards — linting rules and style guides are separate artifacts that operate at a different level of abstraction"
    ],
    "donts_zh": [
      "不要将原则写成没有理由的励志口号——「快速行动」或「追求卓越」在两个价值观冲突时不给工程师任何指导",
      "不要将原则视为不可变的——一个从不更新原则的组织很可能早已停止相信它们",
      "不要通过委员会投票制定原则——共识权威会产生不冒犯任何人也不指导任何人的平淡陈述",
      "不要将原则与编码标准混淆——代码检查规则和风格指南是在不同抽象层次运作的独立产物"
    ],
    "case_study_company": "Monzo",
    "case_study": "Monzo published their engineering principles openly on their blog in 2019 with a set of 9 principles including 'We build for reliability over convenience' and 'We default to transparency'. Each principle included explicit anti-patterns and historical examples from Monzo's own codebase. Within 18 months of publication, internal surveys showed that 78% of engineers could recall at least 5 principles unprompted, and code review culture shifted measurably toward citing principles when requesting changes rather than invoking personal preference.",
    "case_study_zh": "Monzo于2019年在其博客上公开发布了工程原则，包含9条原则，例如「我们为可靠性而非便利性而构建」和「我们默认透明」。每条原则都包含明确的反模式和来自Monzo自身代码库的历史示例。发布后18个月内，内部调查显示78%的工程师能够在不提示的情况下回忆至少5条原则，且代码审查文化在请求更改时明显转向引用原则而非援引个人偏好。",
    "when_not_to_use": [
      "Startups with fewer than 20 engineers where shared context is maintained through direct communication and principles can feel bureaucratic",
      "Crisis situations requiring immediate decisive action — consulting principles during an incident is a symptom of insufficient runbooks, not a substitute",
      "Teams with extremely high turnover where investing in principle documentation yields diminishing returns relative to direct onboarding",
      "Organizations where leadership regularly overrides engineering decisions regardless of stated principles — publishing principles in a low-trust environment creates cynicism"
    ],
    "when_not_to_use_zh": [
      "工程师少于20人的初创公司，共同上下文通过直接沟通维护，原则可能感觉官僚",
      "需要立即果断行动的危机情况——在事故期间查阅原则是运行手册不足的症状，而非替代品",
      "人员流动率极高的团队，投资于原则文档相对于直接入职培训的收益递减",
      "领导层无论声明的原则如何都定期推翻工程决策的组织——在低信任环境中发布原则会产生愤世嫉俗"
    ],
    "primary_source": "Stripe Engineering (2013). \"How we think about software development\". stripe.com/blog",
    "primary_source_zh": "Stripe工程（2013）。《我们如何思考软件开发》。stripe.com/blog",
    "secondary_sources": [
      "Monzo Engineering (2019). \"Our Engineering Principles\". monzo.com/blog/2019/09/engineering-principles",
      "Netflix Technology Blog (2016). \"Netflix Engineering Values\". netflixtechblog.com"
    ],
    "secondary_sources_zh": [
      "Monzo工程（2019）。《我们的工程原则》。monzo.com/blog/2019/09/engineering-principles",
      "Netflix技术博客（2016）。《Netflix工程价值观》。netflixtechblog.com"
    ]
  },
  {
    "id": 151,
    "name": "OpenTelemetry",
    "name_zh": "OpenTelemetry统一可观测性标准",
    "slug": "opentelemetry",
    "category": "observability",
    "desc": "Unified observability standard for traces, metrics, and logs across services",
    "desc_zh": "跨服务的追踪、指标和日志统一可观测性标准",
    "steps": [
      "Instrument applications with OpenTelemetry SDK: add auto-instrumentation agents or manual spans to capture traces, metrics, and logs at service boundaries",
      "Define a consistent resource model: tag every telemetry signal with service.name, service.version, deployment.environment, and other semantic conventions",
      "Configure exporters to send telemetry data to your chosen backend (Jaeger, Prometheus, Grafana, Datadog) via OTLP protocol",
      "Deploy the OpenTelemetry Collector as a central pipeline: receive, process (batch, filter, enrich), and export telemetry data with vendor-agnostic routing",
      "Correlate the three signals: link trace IDs in logs, attach exemplars to metrics, and build dashboards that let you pivot from a metric anomaly to the exact trace and log"
    ],
    "steps_zh": [
      "使用OpenTelemetry SDK对应用进行埋点：添加自动埋点代理或手动Span，在服务边界捕获追踪、指标和日志",
      "定义一致的资源模型：为每个遥测信号打上service.name、service.version、deployment.environment等语义约定标签",
      "配置导出器，通过OTLP协议将遥测数据发送到所选后端（Jaeger、Prometheus、Grafana、Datadog）",
      "部署OpenTelemetry Collector作为中心管道：以厂商无关的路由方式接收、处理（批处理、过滤、丰富）和导出遥测数据",
      "关联三大信号：在日志中嵌入Trace ID，为指标附加Exemplar，构建仪表盘实现从指标异常到精确追踪和日志的跳转"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Instrument",
      "Resource Model",
      "Exporters",
      "OTel Collector",
      "Correlate Signals"
    ],
    "viz_labels_zh": [
      "埋点",
      "资源模型",
      "导出器",
      "收集器",
      "信号关联"
    ],
    "related": [
      "distributed-tracing",
      "structured-logging",
      "four-golden-signals",
      "use-method",
      "red-method",
      "sli-slo-sla"
    ],
    "tags": [
      "observability",
      "opentelemetry",
      "otel",
      "traces",
      "metrics",
      "logs",
      "cncf"
    ],
    "origin_author": "CNCF (merger of OpenTracing & OpenCensus), 2019",
    "origin_source": "Observability Engineering (Charity Majors, Liz Fong-Jones, George Miranda, O'Reilly, 2022)",
    "origin_source_zh": "《可观测性工程》（Charity Majors、Liz Fong-Jones、George Miranda，O'Reilly，2022）",
    "complexity": "intermediate",
    "when_to_use": [
      "Building or migrating to microservices where cross-service visibility is essential for debugging",
      "Organizations wanting vendor-neutral instrumentation to avoid lock-in to a single observability backend",
      "Teams consolidating fragmented telemetry (separate tracing, metrics, and logging libraries) into one standard",
      "Cloud-native environments on Kubernetes where auto-instrumentation and sidecar collectors reduce manual effort"
    ],
    "when_to_use_zh": [
      "构建或迁移微服务架构时，跨服务可见性对调试至关重要",
      "希望采用厂商中立的埋点方案以避免被单一可观测性后端锁定的组织",
      "将碎片化遥测（独立的追踪、指标和日志库）整合为统一标准的团队",
      "Kubernetes上的云原生环境中，自动埋点和Sidecar采集器可减少手工工作"
    ],
    "core_concepts": [
      "Three Pillars Unification: OpenTelemetry provides a single SDK and API for traces, metrics, and logs, eliminating the need for separate instrumentation libraries",
      "Semantic Conventions: Standardized attribute naming (http.method, db.system, rpc.service) ensures consistent telemetry across languages and frameworks",
      "OTLP (OpenTelemetry Protocol): A vendor-neutral wire protocol for transmitting telemetry data efficiently between applications, collectors, and backends",
      "Collector Pipeline: A standalone agent/gateway that receives, processes, and exports telemetry with pluggable receivers, processors, and exporters",
      "Context Propagation: Automatically passes trace context (trace ID, span ID) across service boundaries via HTTP headers (W3C Trace Context) or messaging metadata"
    ],
    "core_concepts_zh": [
      "三大信号统一：OpenTelemetry为追踪、指标和日志提供单一SDK和API，消除对独立埋点库的需求",
      "语义约定：标准化的属性命名（http.method、db.system、rpc.service）确保跨语言和框架的遥测一致性",
      "OTLP（OpenTelemetry协议）：厂商中立的传输协议，在应用、采集器和后端之间高效传输遥测数据",
      "Collector管道：独立的代理/网关，通过可插拔的接收器、处理器和导出器接收、处理和导出遥测数据",
      "上下文传播：通过HTTP头（W3C Trace Context）或消息元数据自动在服务边界间传递追踪上下文（Trace ID、Span ID）"
    ],
    "timeline": [
      [
        "2010",
        "Google publishes the Dapper paper, establishing foundational concepts for distributed tracing"
      ],
      [
        "2016",
        "OpenTracing (CNCF) and OpenCensus (Google) emerge as competing open-source instrumentation standards"
      ],
      [
        "2019",
        "OpenTracing and OpenCensus merge to form OpenTelemetry under CNCF governance"
      ],
      [
        "2021",
        "OpenTelemetry tracing specification reaches 1.0 stability; adopted by major cloud providers"
      ],
      [
        "2023",
        "OpenTelemetry becomes the second most active CNCF project after Kubernetes; log signal reaches stability"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "Google发表Dapper论文，奠定分布式追踪的基础概念"
      ],
      [
        "2016",
        "OpenTracing（CNCF）和OpenCensus（Google）作为竞争性开源埋点标准出现"
      ],
      [
        "2019",
        "OpenTracing和OpenCensus在CNCF治理下合并为OpenTelemetry"
      ],
      [
        "2021",
        "OpenTelemetry追踪规范达到1.0稳定版；被主要云提供商采用"
      ],
      [
        "2023",
        "OpenTelemetry成为仅次于Kubernetes的第二活跃CNCF项目；日志信号达到稳定"
      ]
    ],
    "dos": [
      "Do adopt auto-instrumentation first for quick wins, then add manual spans for business-critical paths that need richer context",
      "Do use the OpenTelemetry Collector instead of sending directly from apps to backends, because it decouples your application from the export destination",
      "Do follow semantic conventions strictly, because inconsistent attribute names make cross-service queries unreliable",
      "Do implement sampling strategies (head-based or tail-based) to control costs while retaining interesting traces"
    ],
    "dos_zh": [
      "优先采用自动埋点以快速见效，然后为需要丰富上下文的业务关键路径添加手动Span",
      "使用OpenTelemetry Collector而非直接从应用发送到后端，因为它将应用与导出目标解耦",
      "严格遵循语义约定，因为不一致的属性命名会使跨服务查询不可靠",
      "实施采样策略（头部采样或尾部采样）以控制成本同时保留有价值的追踪"
    ],
    "donts": [
      "Don't instrument everything at maximum detail from day one, because the telemetry volume and cost will be unmanageable",
      "Don't mix OpenTelemetry with legacy tracing libraries (Zipkin client, Jaeger client) in the same service, because conflicting context propagation causes broken traces",
      "Don't skip resource attribute configuration, because telemetry without service identity is impossible to filter or route",
      "Don't ignore collector health monitoring, because a failing collector silently drops all telemetry data"
    ],
    "donts_zh": [
      "不要从第一天就以最大粒度对所有内容埋点，因为遥测数据量和成本将难以管理",
      "不要在同一服务中混用OpenTelemetry和遗留追踪库（Zipkin客户端、Jaeger客户端），因为冲突的上下文传播会导致追踪链断裂",
      "不要跳过资源属性配置，因为没有服务标识的遥测数据无法过滤或路由",
      "不要忽视Collector健康监控，因为故障的Collector会静默丢弃所有遥测数据"
    ],
    "case_study_company": "Shopify",
    "case_study": "Shopify migrated from a patchwork of Datadog APM, StatsD, and custom tracing to OpenTelemetry across their Ruby on Rails monolith and surrounding microservices. By standardizing on OTel's auto-instrumentation and deploying a fleet of OTel Collectors, they reduced instrumentation maintenance burden by 40% and gained the ability to switch between observability backends without changing application code. During Black Friday 2023, correlated traces and metrics enabled their SRE team to identify and resolve a database connection pool bottleneck in under 5 minutes.",
    "case_study_zh": "Shopify将其Ruby on Rails单体应用及周边微服务从Datadog APM、StatsD和自定义追踪的拼凑方案迁移到OpenTelemetry。通过标准化OTel自动埋点并部署OTel Collector集群，他们将埋点维护负担降低了40%，并获得了在不修改应用代码的情况下切换可观测性后端的能力。2023年黑色星期五期间，关联的追踪和指标使其SRE团队在5分钟内定位并解决了数据库连接池瓶颈。",
    "when_not_to_use": [
      "Simple single-service applications where built-in logging and basic APM provide sufficient visibility",
      "Teams deeply invested in a vendor-specific SDK (e.g., Datadog native) that already provides superior auto-instrumentation for their stack",
      "Embedded or IoT systems with extreme resource constraints where the OTel SDK overhead is prohibitive",
      "Short-lived prototypes where the instrumentation setup time exceeds the project's lifespan"
    ],
    "when_not_to_use_zh": [
      "内置日志和基础APM即可提供足够可见性的简单单服务应用",
      "已深度使用特定厂商SDK（如Datadog原生SDK）且该SDK已为其技术栈提供优秀自动埋点的团队",
      "资源极度受限的嵌入式或物联网系统，OTel SDK开销不可承受",
      "埋点搭建时间超过项目生命周期的短期原型"
    ],
    "adopters": [
      "Shopify",
      "GitHub",
      "eBay",
      "Splunk",
      "Grafana Labs"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Majors, C., Fong-Jones, L. & Miranda, G. (2022). \"Observability Engineering\". O'Reilly Media.",
    "secondary_sources": [
      "CNCF (2019). \"OpenTelemetry Specification\". opentelemetry.io.",
      "Sigelman, B.H. et al. (2010). \"Dapper, a Large-Scale Distributed Systems Tracing Infrastructure\". Google Technical Report."
    ],
    "typed_relations": [
      {
        "slug": "distributed-tracing",
        "type": "complement"
      },
      {
        "slug": "structured-logging",
        "type": "complement"
      },
      {
        "slug": "four-golden-signals",
        "type": "complement"
      },
      {
        "slug": "use-method",
        "type": "alternative"
      },
      {
        "slug": "red-method",
        "type": "alternative"
      },
      {
        "slug": "sli-slo-sla",
        "type": "complement"
      }
    ]
  },
  {
    "id": 152,
    "name": "Distributed Tracing",
    "name_zh": "分布式追踪",
    "slug": "distributed-tracing",
    "category": "observability",
    "desc": "Track requests across service boundaries to diagnose latency and failures in distributed systems",
    "desc_zh": "跨服务边界追踪请求，诊断分布式系统中的延迟和故障",
    "steps": [
      "Generate a unique trace ID at the entry point (API gateway, load balancer) and propagate it through all downstream service calls via headers",
      "Create spans for each meaningful operation: record service name, operation name, start time, duration, status code, and custom attributes",
      "Propagate context across async boundaries: ensure trace context survives message queues, background jobs, and event-driven workflows",
      "Send span data to a trace backend (Jaeger, Zipkin, Tempo) and build a service dependency graph from collected traces",
      "Analyze trace waterfalls to identify latency bottlenecks, error propagation paths, and unexpected sequential calls that should be parallel"
    ],
    "steps_zh": [
      "在入口点（API网关、负载均衡器）生成唯一Trace ID，并通过请求头将其传播到所有下游服务调用",
      "为每个有意义的操作创建Span：记录服务名、操作名、开始时间、持续时间、状态码和自定义属性",
      "跨异步边界传播上下文：确保追踪上下文在消息队列、后台任务和事件驱动工作流中不丢失",
      "将Span数据发送到追踪后端（Jaeger、Zipkin、Tempo），并从收集的追踪中构建服务依赖图",
      "分析追踪瀑布图以识别延迟瓶颈、错误传播路径和应当并行却意外串行的调用"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Trace ID",
      "Spans",
      "Async Context",
      "Trace Backend",
      "Waterfall Analysis"
    ],
    "viz_labels_zh": [
      "追踪ID",
      "Span",
      "异步上下文",
      "追踪后端",
      "瀑布分析"
    ],
    "related": [
      "opentelemetry",
      "structured-logging",
      "four-golden-signals",
      "red-method",
      "circuit-breaker-pattern"
    ],
    "tags": [
      "observability",
      "distributed-tracing",
      "spans",
      "trace-context",
      "latency",
      "microservices"
    ],
    "origin_author": "Benjamin H. Sigelman et al. (Google Dapper), 2010",
    "origin_source": "Dapper, a Large-Scale Distributed Systems Tracing Infrastructure (Google Technical Report, 2010); Observability Engineering (Majors, Fong-Jones, Miranda, 2022)",
    "origin_source_zh": "《Dapper：大规模分布式系统追踪基础设施》（Google技术报告，2010）；《可观测性工程》（Majors、Fong-Jones、Miranda，2022）",
    "complexity": "intermediate",
    "when_to_use": [
      "Microservice architectures where a single user request fans out across multiple services and databases",
      "Diagnosing latency spikes that span multiple network hops and cannot be isolated to a single service",
      "Understanding service dependencies and call patterns in complex distributed systems",
      "Debugging intermittent failures that only occur under specific request paths or data combinations"
    ],
    "when_to_use_zh": [
      "单个用户请求扇出到多个服务和数据库的微服务架构",
      "诊断跨越多个网络跳数、无法隔离到单个服务的延迟峰值",
      "理解复杂分布式系统中的服务依赖和调用模式",
      "调试仅在特定请求路径或数据组合下出现的间歇性故障"
    ],
    "core_concepts": [
      "Trace: A directed acyclic graph of spans representing the complete lifecycle of a request as it traverses multiple services",
      "Span: A named, timed operation within a trace; each span records service, operation, duration, status, and parent span ID to form the tree structure",
      "Context Propagation: The mechanism of passing trace ID and span ID across process boundaries via HTTP headers (W3C Trace Context, B3) or message metadata",
      "Sampling: A strategy to reduce trace volume by collecting only a fraction of traces (head-based) or selectively retaining interesting traces (tail-based)",
      "Trace Waterfall: A visualization showing the timeline of all spans in a trace, revealing sequential vs parallel execution, bottlenecks, and error locations"
    ],
    "core_concepts_zh": [
      "Trace：由Span组成的有向无环图，代表请求在多个服务间传播的完整生命周期",
      "Span：追踪中一个命名的、计时的操作；每个Span记录服务、操作、持续时间、状态和父Span ID以形成树形结构",
      "上下文传播：通过HTTP头（W3C Trace Context、B3）或消息元数据在进程边界间传递Trace ID和Span ID的机制",
      "采样：通过仅收集部分追踪（头部采样）或选择性保留有价值追踪（尾部采样）来减少追踪量的策略",
      "追踪瀑布图：展示追踪中所有Span时间线的可视化，揭示串行与并行执行、瓶颈和错误位置"
    ],
    "timeline": [
      [
        "2010",
        "Google publishes the Dapper paper describing their production distributed tracing system"
      ],
      [
        "2012",
        "Twitter open-sources Zipkin, the first widely adopted open-source distributed tracing system"
      ],
      [
        "2015",
        "Uber develops and open-sources Jaeger, a Dapper-inspired tracing system built for cloud-native scale"
      ],
      [
        "2019",
        "W3C Trace Context specification becomes a standard, enabling interoperable context propagation across vendors"
      ],
      [
        "2022",
        "Distributed tracing becomes table stakes in cloud-native observability; Grafana Tempo offers cost-effective trace storage"
      ]
    ],
    "timeline_zh": [
      [
        "2010",
        "Google发表Dapper论文，描述其生产级分布式追踪系统"
      ],
      [
        "2012",
        "Twitter开源Zipkin，成为首个被广泛采用的开源分布式追踪系统"
      ],
      [
        "2015",
        "Uber开发并开源Jaeger，一个受Dapper启发、为云原生规模构建的追踪系统"
      ],
      [
        "2019",
        "W3C Trace Context规范成为标准，实现跨厂商的互操作上下文传播"
      ],
      [
        "2022",
        "分布式追踪成为云原生可观测性的标配；Grafana Tempo提供经济高效的追踪存储"
      ]
    ],
    "dos": [
      "Do propagate trace context through all communication channels including message queues, gRPC, and event buses, because broken context creates orphan spans",
      "Do add business-relevant attributes to spans (user ID, order ID, feature flag variant) so traces become queryable by business dimensions",
      "Do implement tail-based sampling in the collector to retain error traces and slow traces while dropping routine ones",
      "Do set up service dependency maps derived from traces to maintain an always-current architecture diagram"
    ],
    "dos_zh": [
      "务必通过所有通信通道（消息队列、gRPC、事件总线）传播追踪上下文，因为断裂的上下文会产生孤立Span",
      "为Span添加业务相关属性（用户ID、订单ID、功能开关变体），使追踪可按业务维度查询",
      "在采集器中实施尾部采样，保留错误追踪和慢追踪，丢弃常规追踪",
      "基于追踪建立服务依赖图，维护始终最新的架构图"
    ],
    "donts": [
      "Don't create a span for every function call, because excessive span granularity generates massive data volumes and obscures meaningful signals",
      "Don't rely solely on head-based sampling at high rates, because it randomly discards error and slow traces that are most valuable for debugging",
      "Don't forget to trace database queries and cache lookups, because they are often the largest contributors to request latency",
      "Don't ignore trace data retention policies, because unbounded storage of high-cardinality trace data leads to runaway costs"
    ],
    "donts_zh": [
      "不要为每个函数调用创建Span，因为过度的Span粒度会产生海量数据并淹没有意义的信号",
      "不要仅依赖高比率的头部采样，因为它会随机丢弃对调试最有价值的错误和慢追踪",
      "不要忘记对数据库查询和缓存查找进行追踪，因为它们往往是请求延迟的最大贡献者",
      "不要忽视追踪数据保留策略，因为对高基数追踪数据的无限存储会导致成本失控"
    ],
    "case_study_company": "Uber",
    "case_study": "Uber built Jaeger to trace requests across their 4,000+ microservices. Before Jaeger, debugging a failed ride request required manually correlating logs across dozens of services. With distributed tracing, engineers could visualize the complete request flow from the rider app through dispatch, pricing, driver matching, and payment services. Jaeger helped Uber reduce mean time to resolution (MTTR) for production incidents by over 60%, and the trace-derived service dependency graph became the single source of truth for their microservice architecture.",
    "case_study_zh": "Uber构建了Jaeger来追踪跨4000多个微服务的请求。在Jaeger之前，调试失败的叫车请求需要手动关联数十个服务的日志。有了分布式追踪，工程师可以可视化从乘客应用到调度、定价、司机匹配和支付服务的完整请求流。Jaeger帮助Uber将生产事故的平均解决时间（MTTR）降低了60%以上，而追踪派生的服务依赖图成为其微服务架构的唯一事实来源。",
    "when_not_to_use": [
      "Monolithic applications where all request processing happens in a single process and a profiler provides better insight",
      "Batch processing systems where request-oriented tracing does not align with the job-based execution model",
      "Systems processing fewer than 100 requests per second where simple logging provides sufficient debugging context",
      "Extremely latency-sensitive hot paths where the overhead of span creation and context propagation is unacceptable"
    ],
    "when_not_to_use_zh": [
      "所有请求处理在单进程内完成的单体应用——性能分析器能提供更好的洞察",
      "基于请求的追踪与基于任务的执行模型不匹配的批处理系统",
      "每秒处理请求不足100个的系统——简单日志已能提供足够的调试上下文",
      "Span创建和上下文传播开销不可接受的极端延迟敏感热路径"
    ],
    "adopters": [
      "Uber",
      "Netflix",
      "Airbnb",
      "Lyft",
      "Datadog"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "reliability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Sigelman, B.H. et al. (2010). \"Dapper, a Large-Scale Distributed Systems Tracing Infrastructure\". Google Technical Report.",
    "secondary_sources": [
      "Majors, C., Fong-Jones, L. & Miranda, G. (2022). \"Observability Engineering\". O'Reilly Media.",
      "Shkuro, Y. (2019). \"Mastering Distributed Tracing\". Packt Publishing."
    ],
    "typed_relations": [
      {
        "slug": "opentelemetry",
        "type": "complement"
      },
      {
        "slug": "structured-logging",
        "type": "complement"
      },
      {
        "slug": "four-golden-signals",
        "type": "complement"
      },
      {
        "slug": "red-method",
        "type": "complement"
      },
      {
        "slug": "circuit-breaker-pattern",
        "type": "complement"
      }
    ]
  },
  {
    "id": 153,
    "name": "Structured Logging",
    "name_zh": "结构化日志",
    "slug": "structured-logging",
    "category": "observability",
    "desc": "Machine-parseable log format patterns that enable reliable querying and correlation at scale",
    "desc_zh": "可机器解析的日志格式模式，支持大规模下的可靠查询和关联分析",
    "steps": [
      "Replace unstructured log strings with structured key-value pairs: emit logs as JSON objects with consistent field names (timestamp, level, service, message, trace_id)",
      "Define a logging schema: standardize field names, types, and required fields across all services in a shared logging library or convention document",
      "Enrich logs with contextual metadata: attach request ID, user ID, trace ID, deployment version, and environment to every log entry automatically",
      "Ship structured logs to a centralized log aggregation platform (ELK, Loki, CloudWatch Logs) via a unified agent or sidecar",
      "Build queryable dashboards and alerts: use structured fields to filter, group, and aggregate logs without relying on fragile regex patterns"
    ],
    "steps_zh": [
      "将非结构化日志字符串替换为结构化键值对：以JSON对象形式输出日志，使用一致的字段名（timestamp、level、service、message、trace_id）",
      "定义日志模式：在共享日志库或约定文档中跨所有服务标准化字段名、类型和必需字段",
      "用上下文元数据丰富日志：自动为每条日志附加请求ID、用户ID、Trace ID、部署版本和环境信息",
      "通过统一代理或Sidecar将结构化日志传送到集中式日志聚合平台（ELK、Loki、CloudWatch Logs）",
      "构建可查询的仪表盘和告警：使用结构化字段过滤、分组和聚合日志，无需依赖脆弱的正则表达式"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "JSON Fields",
      "Log Schema",
      "Context Enrich",
      "Log Aggregation",
      "Queryable Dashboards"
    ],
    "viz_labels_zh": [
      "JSON字段",
      "日志模式",
      "上下文丰富",
      "日志聚合",
      "可查询看板"
    ],
    "related": [
      "opentelemetry",
      "distributed-tracing",
      "four-golden-signals",
      "slo-as-practice"
    ],
    "tags": [
      "observability",
      "logging",
      "structured-logging",
      "json-logs",
      "log-aggregation",
      "elk"
    ],
    "origin_author": "Industry evolution from syslog; formalized by observability community, ~2013",
    "origin_source": "Observability Engineering (Majors, Fong-Jones, Miranda, 2022); Software Engineering at Google (Winters, Manshreck, Wright, 2020)",
    "origin_source_zh": "《可观测性工程》（Majors、Fong-Jones、Miranda，2022）；《Google软件工程》（Winters、Manshreck、Wright，2020）",
    "complexity": "beginner",
    "when_to_use": [
      "Any system producing logs that need to be queried, filtered, or correlated across services",
      "Microservice architectures where unstructured logs from dozens of services become impossible to search manually",
      "Compliance environments requiring auditable, machine-parseable log records with consistent schemas",
      "Teams adopting observability practices who need logs to correlate with traces and metrics via shared identifiers"
    ],
    "when_to_use_zh": [
      "任何需要查询、过滤或跨服务关联日志的系统",
      "数十个服务的非结构化日志变得无法手动搜索的微服务架构",
      "要求可审计、机器可解析且具有一致模式的日志记录的合规环境",
      "采用可观测性实践、需要日志通过共享标识符与追踪和指标关联的团队"
    ],
    "core_concepts": [
      "Key-Value Logging: Every log entry is a set of typed key-value pairs (JSON, logfmt) rather than a freeform string, enabling precise field-level queries",
      "Canonical Log Lines: A pattern where a single summary log line per request captures all important dimensions, reducing log volume while maximizing queryability",
      "Log Levels as Contracts: Structured severity levels (DEBUG, INFO, WARN, ERROR) are enforced consistently and used for alerting thresholds and retention policies",
      "Contextual Enrichment: Middleware or interceptors automatically inject request-scoped metadata (trace ID, user ID, request path) into every log entry",
      "Schema Evolution: Structured log schemas must be versioned and backward-compatible, just like API contracts, to prevent downstream parsing failures"
    ],
    "core_concepts_zh": [
      "键值日志：每条日志是一组类型化的键值对（JSON、logfmt）而非自由格式字符串，支持精确的字段级查询",
      "规范日志行：每个请求生成一条汇总日志行以捕获所有重要维度的模式，减少日志量同时最大化可查询性",
      "日志级别即契约：一致地强制执行结构化严重性级别（DEBUG、INFO、WARN、ERROR），用于告警阈值和保留策略",
      "上下文丰富：中间件或拦截器自动将请求范围的元数据（Trace ID、用户ID、请求路径）注入每条日志",
      "模式演进：结构化日志模式必须版本化且向后兼容，就像API契约一样，以防止下游解析失败"
    ],
    "timeline": [
      [
        "2001",
        "RFC 3164 defines the BSD syslog protocol, establishing early semi-structured log conventions"
      ],
      [
        "2013",
        "JSON logging becomes mainstream as ELK stack (Elasticsearch, Logstash, Kibana) gains widespread adoption"
      ],
      [
        "2015",
        "Stripe popularizes the canonical log lines pattern for high-cardinality event analysis"
      ],
      [
        "2019",
        "Cloud-native logging standards emerge with Fluentd and Fluent Bit as CNCF graduated projects"
      ],
      [
        "2022",
        "Observability Engineering book codifies structured logging as a core pillar alongside traces and metrics"
      ]
    ],
    "timeline_zh": [
      [
        "2001",
        "RFC 3164定义BSD syslog协议，建立早期半结构化日志约定"
      ],
      [
        "2013",
        "随着ELK栈（Elasticsearch、Logstash、Kibana）的广泛采用，JSON日志成为主流"
      ],
      [
        "2015",
        "Stripe推广规范日志行模式，用于高基数事件分析"
      ],
      [
        "2019",
        "云原生日志标准随Fluentd和Fluent Bit成为CNCF毕业项目而出现"
      ],
      [
        "2022",
        "《可观测性工程》将结构化日志编纂为与追踪和指标并列的核心支柱"
      ]
    ],
    "dos": [
      "Do use a shared logging library that enforces schema consistency across all services, because ad-hoc JSON formatting leads to field name drift",
      "Do include trace ID and span ID in every log entry, because this enables direct navigation from a log line to its parent trace",
      "Do adopt the canonical log lines pattern for high-throughput services, because one rich line per request is cheaper and more useful than scattered verbose logs",
      "Do set per-field retention policies, because debug-level logs and high-cardinality fields should expire faster than error logs"
    ],
    "dos_zh": [
      "使用共享日志库在所有服务间强制模式一致性，因为临时的JSON格式化会导致字段名漂移",
      "在每条日志中包含Trace ID和Span ID，因为这使得从日志行直接导航到其父追踪成为可能",
      "为高吞吐量服务采用规范日志行模式，因为每个请求一条丰富的日志行比分散的冗长日志更经济且更有用",
      "设置按字段的保留策略，因为调试级别日志和高基数字段应比错误日志更快过期"
    ],
    "donts": [
      "Don't log sensitive data (passwords, tokens, PII) in structured fields, because centralized log systems make data leaks searchable at scale",
      "Don't use string interpolation for log messages in hot paths, because constructing unused log strings wastes CPU even when the log level is disabled",
      "Don't embed stack traces as a single giant string field, because it prevents structured parsing of exception class, message, and frame data",
      "Don't allow unbounded log field cardinality (e.g., logging full request bodies), because it explodes index size and query costs in the aggregation backend"
    ],
    "donts_zh": [
      "不要在结构化字段中记录敏感数据（密码、令牌、PII），因为集中式日志系统使数据泄露可被大规模搜索",
      "不要在热路径中使用字符串插值构建日志消息，因为即使日志级别被禁用，构建未使用的日志字符串也会浪费CPU",
      "不要将堆栈追踪作为单个巨大字符串字段嵌入，因为这会阻止对异常类、消息和帧数据的结构化解析",
      "不要允许无界的日志字段基数（如记录完整请求体），因为它会使聚合后端的索引大小和查询成本爆炸"
    ],
    "case_study_company": "Stripe",
    "case_study": "Stripe pioneered the canonical log lines pattern where every API request produces exactly one structured log line containing all relevant dimensions: merchant ID, API version, endpoint, latency, response code, and active feature flags. This approach reduced their log volume by 10x compared to scattered multi-line logging while dramatically improving debugging speed. Engineers can query any combination of dimensions without writing regex, and the pattern became foundational to Stripe's high-cardinality observability strategy described on their engineering blog.",
    "case_study_zh": "Stripe率先实践了规范日志行模式，每个API请求恰好生成一条包含所有相关维度的结构化日志行：商户ID、API版本、端点、延迟、响应码和激活的功能开关。与分散的多行日志相比，这种方法将日志量减少了10倍，同时显著提高了调试速度。工程师无需编写正则表达式即可查询任意维度组合，该模式成为Stripe在其工程博客中描述的高基数可观测性策略的基础。",
    "when_not_to_use": [
      "Embedded systems with extreme memory constraints where JSON serialization overhead is prohibitive",
      "Local development debugging where human-readable plaintext logs are faster to scan visually",
      "Legacy systems where retrofitting structured logging across the entire codebase is cost-prohibitive and the remaining lifespan is short",
      "Ultra-high-throughput data pipelines where even minimal logging adds unacceptable latency"
    ],
    "when_not_to_use_zh": [
      "JSON序列化开销不可承受的极端内存受限嵌入式系统",
      "本地开发调试中人类可读的纯文本日志更便于视觉扫描",
      "在整个代码库中改造结构化日志成本过高且剩余生命周期较短的遗留系统",
      "即使最小日志开销也会增加不可接受延迟的超高吞吐量数据管道"
    ],
    "adopters": [
      "Stripe",
      "Datadog",
      "Cloudflare",
      "Honeycomb",
      "Grafana Labs"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "observability",
      "maintainability"
    ],
    "maturity_ring": "established",
    "primary_source": "Majors, C., Fong-Jones, L. & Miranda, G. (2022). \"Observability Engineering\". O'Reilly Media.",
    "secondary_sources": [
      "Winters, T., Manshreck, T. & Wright, H. (2020). \"Software Engineering at Google\", Ch. 16. O'Reilly Media.",
      "Beyer, B. et al. (2016). \"Site Reliability Engineering: How Google Runs Production Systems\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "opentelemetry",
        "type": "complement"
      },
      {
        "slug": "distributed-tracing",
        "type": "complement"
      },
      {
        "slug": "four-golden-signals",
        "type": "complement"
      },
      {
        "slug": "slo-as-practice",
        "type": "complement"
      }
    ]
  },
  {
    "id": 154,
    "name": "SLO-as-Practice",
    "name_zh": "SLO实践方法论",
    "slug": "slo-as-practice",
    "category": "observability",
    "desc": "Operationalize SLO methodology as a continuous engineering practice for reliability culture",
    "desc_zh": "将SLO方法论运营化为持续的工程实践，构建可靠性文化",
    "steps": [
      "Identify user journeys and map them to measurable SLIs: choose indicators that reflect real user happiness (successful login rate, checkout latency p99, search result relevance)",
      "Set SLO targets through stakeholder negotiation: balance user expectations, engineering cost, and business risk rather than defaulting to arbitrary nines",
      "Implement SLO-based alerting with multi-window burn rate: alert on budget consumption rate rather than raw threshold breaches to reduce false positives",
      "Establish error budget policies: define concrete actions at budget thresholds (50% remaining: review, 25%: freeze risky deploys, 0%: all hands on reliability)",
      "Run SLO review meetings monthly: analyze budget burn trends, adjust targets based on user feedback, and use SLO data to prioritize reliability vs feature work"
    ],
    "steps_zh": [
      "识别用户旅程并映射为可度量的SLI：选择反映真实用户满意度的指标（登录成功率、结算延迟p99、搜索结果相关性）",
      "通过利益相关者协商设定SLO目标：平衡用户期望、工程成本和业务风险，而非默认使用任意的几个9",
      "实施基于SLO的多窗口燃烧率告警：对预算消耗速率而非原始阈值触发告警，以减少误报",
      "建立错误预算策略：在预算阈值处定义具体行动（剩余50%：审查，25%：冻结高风险部署，0%：全力投入可靠性）",
      "每月召开SLO回顾会议：分析预算燃烧趋势，根据用户反馈调整目标，并使用SLO数据来确定可靠性与功能工作的优先级"
    ],
    "ai_relevant": true,
    "viz_type": "cycle",
    "viz_labels": [
      "SLI",
      "SLO Target",
      "Burn Rate Alert",
      "Error Budget Policy",
      "SLO Review"
    ],
    "viz_labels_zh": [
      "SLI指标",
      "SLO目标",
      "燃烧率告警",
      "错误预算策略",
      "SLO评审"
    ],
    "related": [
      "sli-slo-sla",
      "error-budget-policy",
      "four-golden-signals",
      "dora-metrics",
      "on-call-engineering"
    ],
    "tags": [
      "observability",
      "slo",
      "reliability",
      "error-budget",
      "burn-rate",
      "sre"
    ],
    "origin_author": "Google SRE team; codified in Implementing Service Level Objectives (Alex Hidalgo, 2020)",
    "origin_source": "Implementing Service Level Objectives (Alex Hidalgo, O'Reilly, 2020); Site Reliability Engineering (Beyer et al., 2016)",
    "origin_source_zh": "《实施服务水平目标》（Alex Hidalgo，O'Reilly，2020）；《SRE：Google运维解密》（Beyer等，2016）",
    "complexity": "advanced",
    "when_to_use": [
      "Organizations that have defined SLIs/SLOs but struggle to operationalize them into daily engineering decisions",
      "Teams drowning in alerts from threshold-based monitoring who need a more intelligent alerting approach",
      "Engineering leadership that needs objective data to balance reliability investment against feature velocity",
      "Post-incident reviews that repeatedly identify lack of reliability prioritization as a root cause"
    ],
    "when_to_use_zh": [
      "已定义SLI/SLO但难以将其运营化为日常工程决策的组织",
      "被基于阈值的监控告警淹没、需要更智能告警方法的团队",
      "需要客观数据来平衡可靠性投入与功能开发速度的工程领导层",
      "事后回顾反复将缺乏可靠性优先级化识别为根因的情况"
    ],
    "core_concepts": [
      "SLO as a Practice: SLOs are not just numbers in a config file; they are a continuous feedback loop of measuring, alerting, reviewing, and adjusting that shapes engineering culture",
      "Multi-Window Burn Rate Alerting: Compares error budget consumption across short (1h) and long (6h, 3d) windows to detect both sudden outages and gradual degradation with minimal false positives",
      "Error Budget as Currency: The error budget is a finite resource that engineering teams 'spend' on risky changes; when it is depleted, the team must shift to reliability work",
      "SLO Document: A living document per service specifying SLIs, SLO targets, error budget policies, escalation paths, and review cadence -- the contract between product and engineering",
      "Aspirational vs Achievable SLOs: Teams maintain two tiers -- achievable SLOs that trigger alerts and aspirational SLOs that guide long-term reliability investment"
    ],
    "core_concepts_zh": [
      "SLO即实践：SLO不只是配置文件中的数字；它是一个由度量、告警、回顾和调整组成的持续反馈循环，塑造工程文化",
      "多窗口燃烧率告警：比较短窗口（1小时）和长窗口（6小时、3天）的错误预算消耗，以最少的误报检测突发故障和渐进劣化",
      "错误预算即货币：错误预算是工程团队「花费」在有风险变更上的有限资源；耗尽时团队必须转向可靠性工作",
      "SLO文档：每个服务的活文档，规定SLI、SLO目标、错误预算策略、升级路径和回顾节奏——产品与工程之间的契约",
      "期望型与可达型SLO：团队维护两个层级——触发告警的可达SLO和指导长期可靠性投入的期望SLO"
    ],
    "timeline": [
      [
        "2003",
        "Google SRE team begins using error budgets internally to balance reliability and velocity"
      ],
      [
        "2016",
        "Site Reliability Engineering book publishes Google's SLO practices, sparking industry adoption"
      ],
      [
        "2018",
        "The SRE Workbook provides step-by-step SLO implementation guidance with worked examples"
      ],
      [
        "2020",
        "Alex Hidalgo publishes Implementing Service Level Objectives, the definitive practitioner's guide to SLO-as-practice"
      ],
      [
        "2023",
        "SLO-based alerting and error budget policies become standard practice at most cloud-native organizations"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Google SRE团队开始在内部使用错误预算来平衡可靠性和开发速度"
      ],
      [
        "2016",
        "《SRE：Google运维解密》发布Google的SLO实践，引发行业采用"
      ],
      [
        "2018",
        "《SRE工作手册》提供带有实例的逐步SLO实施指导"
      ],
      [
        "2020",
        "Alex Hidalgo出版《实施服务水平目标》，成为SLO实践的权威从业者指南"
      ],
      [
        "2023",
        "基于SLO的告警和错误预算策略成为大多数云原生组织的标准实践"
      ]
    ],
    "dos": [
      "Do start with 2-3 SLOs per service covering the most critical user journeys, because too many SLOs dilute attention and make budget tracking unmanageable",
      "Do involve product managers in SLO target-setting, because reliability targets must reflect business context not just engineering preference",
      "Do automate SLO reporting and budget dashboards, because manual tracking guarantees the practice will be abandoned within months",
      "Do revisit SLO targets quarterly, because user expectations and system capabilities change over time"
    ],
    "dos_zh": [
      "每个服务从覆盖最关键用户旅程的2-3个SLO开始，因为过多SLO会分散注意力并使预算追踪不可管理",
      "让产品经理参与SLO目标设定，因为可靠性目标必须反映业务上下文而非仅仅工程偏好",
      "自动化SLO报告和预算仪表盘，因为手动追踪几乎保证该实践在数月内被放弃",
      "每季度重新审视SLO目标，因为用户期望和系统能力会随时间变化"
    ],
    "donts": [
      "Don't set SLOs without error budget policies, because SLOs without consequences are just dashboards that nobody acts on",
      "Don't alert on every SLO violation immediately, because multi-window burn rate alerting is far more actionable than raw threshold alerts",
      "Don't use SLOs as a performance evaluation tool for individual engineers, because it creates incentives to game the metrics rather than improve reliability",
      "Don't copy another company's SLO targets, because the right target depends on your users' expectations and your system's architecture"
    ],
    "donts_zh": [
      "不要在没有错误预算策略的情况下设定SLO，因为没有后果的SLO只是无人响应的仪表盘",
      "不要对每次SLO违反立即告警，因为多窗口燃烧率告警比原始阈值告警更可操作",
      "不要将SLO作为评估个人工程师绩效的工具，因为这会激励操纵指标而非改善可靠性",
      "不要复制其他公司的SLO目标，因为正确的目标取决于你的用户期望和系统架构"
    ],
    "case_study_company": "The Guardian",
    "case_study": "The Guardian newspaper's engineering team adopted SLO-as-practice to manage reliability across their content delivery platform serving 150+ million monthly readers. They defined SLOs around article load time (p99 < 3s), content API availability (99.9%), and search relevance. By implementing multi-window burn rate alerts, they reduced alert volume by 90% compared to their previous threshold-based system. Monthly SLO review meetings with product and editorial stakeholders created a shared language for reliability trade-offs, enabling the team to confidently invest in a major platform migration while staying within error budget.",
    "case_study_zh": "《卫报》的工程团队采用SLO实践方法论来管理其服务超过1.5亿月活读者的内容分发平台的可靠性。他们围绕文章加载时间（p99 < 3秒）、内容API可用性（99.9%）和搜索相关性定义SLO。通过实施多窗口燃烧率告警，他们将告警量比之前的阈值系统减少了90%。与产品和编辑利益相关者的月度SLO回顾会议为可靠性权衡创建了共同语言，使团队能够在保持错误预算范围内的情况下自信地进行重大平台迁移。",
    "when_not_to_use": [
      "Very early-stage startups where user patterns are unknown and setting meaningful SLOs is premature",
      "Internal tools used by fewer than 10 people where informal reliability communication is more efficient",
      "Systems where 100% correctness is legally required and error budgets are philosophically inapplicable",
      "Organizations without basic monitoring in place -- you need metrics before you can set objectives on them"
    ],
    "when_not_to_use_zh": [
      "用户模式未知且设定SLO为时过早的极早期初创公司",
      "少于10人使用的内部工具——非正式可靠性沟通更高效",
      "法律要求100%正确性、错误预算在哲学上不适用的系统",
      "尚未建立基础监控的组织——需要先有指标才能为其设定目标"
    ],
    "adopters": [
      "Google",
      "The Guardian",
      "Spotify",
      "Atlassian",
      "Honeycomb"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "reliability",
      "observability"
    ],
    "maturity_ring": "established",
    "primary_source": "Hidalgo, A. (2020). \"Implementing Service Level Objectives\". O'Reilly Media.",
    "secondary_sources": [
      "Beyer, B. et al. (2016). \"Site Reliability Engineering: How Google Runs Production Systems\", Ch. 4. O'Reilly Media.",
      "Beyer, B. et al. (2018). \"The Site Reliability Workbook\", Ch. 2. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "sli-slo-sla",
        "type": "extends"
      },
      {
        "slug": "error-budget-policy",
        "type": "complement"
      },
      {
        "slug": "four-golden-signals",
        "type": "prerequisite"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      },
      {
        "slug": "on-call-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 155,
    "name": "Error Budget Policy",
    "name_zh": "错误预算策略",
    "slug": "error-budget-policy",
    "category": "observability",
    "desc": "Managing reliability vs velocity trade-offs through quantified error budgets and escalation policies",
    "desc_zh": "通过量化的错误预算和升级策略管理可靠性与开发速度的权衡",
    "steps": [
      "Calculate the error budget from SLO targets: if the SLO is 99.9% availability, the error budget is 0.1% of total requests (or ~43 minutes of downtime per 30-day window)",
      "Define budget consumption tracking: instrument real-time dashboards showing remaining budget as a percentage, burn rate, and projected exhaustion date",
      "Establish tiered policy actions: specify what happens at each threshold (e.g., 50% consumed: weekly review, 75%: halt risky deploys, 100%: reliability-only sprint)",
      "Create an escalation matrix: define who has authority to override the policy, under what circumstances, and what documentation is required",
      "Run retrospectives when budgets are exhausted or replenished: analyze what consumed the budget, whether the policy actions were effective, and adjust thresholds for the next period"
    ],
    "steps_zh": [
      "根据SLO目标计算错误预算：若SLO为99.9%可用性，错误预算为总请求的0.1%（或30天窗口内约43分钟停机时间）",
      "定义预算消耗追踪：构建实时仪表盘显示剩余预算百分比、燃烧率和预计耗尽日期",
      "建立分级策略行动：规定每个阈值处的行动（如消耗50%：每周审查，75%：暂停高风险部署，100%：纯可靠性冲刺）",
      "创建升级矩阵：定义谁有权覆盖策略、在什么情况下以及需要什么文档记录",
      "在预算耗尽或补充时进行回顾：分析什么消耗了预算、策略行动是否有效，并为下一周期调整阈值"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Error Budget",
      "Budget Tracking",
      "Tiered Policy",
      "Escalation Matrix",
      "Budget Retro"
    ],
    "viz_labels_zh": [
      "错误预算",
      "预算追踪",
      "分级策略",
      "升级矩阵",
      "预算回顾"
    ],
    "related": [
      "slo-as-practice",
      "sli-slo-sla",
      "on-call-engineering",
      "dora-metrics",
      "chaos-engineering"
    ],
    "tags": [
      "observability",
      "error-budget",
      "reliability",
      "velocity",
      "sre",
      "policy"
    ],
    "origin_author": "Google SRE team (Ben Treynor Sloss), ~2003; formalized in SRE Book",
    "origin_source": "Site Reliability Engineering: How Google Runs Production Systems (Beyer et al., 2016); The Site Reliability Workbook (Beyer et al., 2018)",
    "origin_source_zh": "《SRE：Google运维解密》（Beyer等，2016）；《SRE工作手册》（Beyer等，2018）",
    "complexity": "intermediate",
    "when_to_use": [
      "Teams locked in perpetual conflict between product managers pushing features and SREs demanding stability",
      "Organizations that need an objective, data-driven mechanism to decide when to freeze deploys vs ship faster",
      "Services where past incidents trace back to shipping too many risky changes without reliability investment",
      "Engineering leadership seeking to quantify the cost of unreliability in terms product teams understand"
    ],
    "when_to_use_zh": [
      "产品经理推动功能和SRE要求稳定之间长期冲突的团队",
      "需要客观、数据驱动机制来决定何时冻结部署与加速交付的组织",
      "过往事故根因追溯到在没有可靠性投入的情况下发布过多高风险变更的服务",
      "寻求用产品团队能理解的方式量化不可靠性成本的工程领导层"
    ],
    "core_concepts": [
      "Error Budget: The mathematically derived tolerance for unreliability (1 - SLO); a 99.9% SLO yields a 0.1% error budget, approximately 43 minutes per month",
      "Burn Rate: The speed at which the error budget is being consumed; a burn rate of 2x means the budget will exhaust in half the window period",
      "Policy Tiers: Predefined escalation levels triggered at specific budget consumption percentages, ranging from increased review to full deployment freezes",
      "Budget Replenishment: Error budgets replenish as the measurement window (typically a rolling 30 days) advances and older errors age out, giving teams a fresh allocation to spend on innovation",
      "Override Authority: A documented exception process for cases where business-critical launches must proceed despite an exhausted budget, with mandatory post-mortems"
    ],
    "core_concepts_zh": [
      "错误预算：通过数学推导得出的不可靠性容忍度（1 - SLO）；99.9%的SLO产生0.1%的错误预算，约每月43分钟",
      "燃烧率：错误预算被消耗的速度；2倍燃烧率意味着预算将在窗口期的一半时间内耗尽",
      "策略层级：在特定预算消耗百分比处触发的预定义升级级别，从加强审查到完全冻结部署",
      "预算补充：随着度量窗口（通常为滚动的30天）向前推进、较早的错误移出窗口，错误预算随之恢复，为团队提供新的创新配额",
      "覆盖权限：一个文档化的例外流程，用于业务关键发布必须在预算耗尽时仍然进行的情况，附带强制事后回顾"
    ],
    "timeline": [
      [
        "2003",
        "Google SRE introduces error budgets as an internal mechanism to resolve the tension between development and operations"
      ],
      [
        "2016",
        "The SRE Book publishes Google's error budget model, establishing it as an industry-recognized practice"
      ],
      [
        "2018",
        "The SRE Workbook provides detailed error budget policy templates and implementation examples"
      ],
      [
        "2020",
        "Implementing Service Level Objectives formalizes error budget policies as the critical link between SLOs and organizational behavior"
      ],
      [
        "2023",
        "Error budget policies are integrated into observability platforms (Nobl9, Datadog SLO) as automated policy engines"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Google SRE引入错误预算作为解决开发与运维间张力的内部机制"
      ],
      [
        "2016",
        "《SRE：Google运维解密》发布Google的错误预算模型，将其确立为行业认可的实践"
      ],
      [
        "2018",
        "《SRE工作手册》提供详细的错误预算策略模板和实施示例"
      ],
      [
        "2020",
        "《实施服务水平目标》将错误预算策略正式化为连接SLO与组织行为的关键环节"
      ],
      [
        "2023",
        "错误预算策略被集成到可观测性平台（Nobl9、Datadog SLO）中作为自动化策略引擎"
      ]
    ],
    "dos": [
      "Do make the error budget policy a written, signed document agreed upon by engineering, product, and management before incidents occur",
      "Do automate budget tracking and alerting so that policy actions are triggered by data, not by someone remembering to check a dashboard",
      "Do include positive incentives for teams with remaining budget (more experimental deployments, hack time) alongside the restrictive actions for exhaustion",
      "Do treat error budget policy violations (shipping despite exhaustion without override) as a process failure warranting a retrospective"
    ],
    "dos_zh": [
      "将错误预算策略制定为书面、签署的文档，在事故发生前由工程、产品和管理层共同同意",
      "自动化预算追踪和告警，使策略行动由数据触发，而非依赖于某人记得去检查仪表盘",
      "在预算耗尽的限制性行动之外，为仍有预算剩余的团队设置正向激励（更多实验性部署、黑客时间）",
      "将错误预算策略违反（在没有覆盖授权的情况下耗尽后仍发布）视为需要回顾的流程失败"
    ],
    "donts": [
      "Don't set error budget policies without executive buy-in, because a policy that product leadership can ignore at will provides no value",
      "Don't make the budget window too short (e.g., 7 days), because short windows are too volatile and trigger false deployment freezes from single incidents",
      "Don't use error budgets to punish teams, because it creates incentives to hide incidents or inflate SLO targets to make budgets artificially large",
      "Don't apply the same policy rigidity to all services, because a 0.1% budget for a payment service has different business impact than 0.1% for a recommendations widget"
    ],
    "donts_zh": [
      "不要在没有高管支持的情况下设定错误预算策略，因为产品领导可以随意忽略的策略毫无价值",
      "不要将预算窗口设置过短（如7天），因为短窗口波动性太大，单个事故就会触发错误的部署冻结",
      "不要用错误预算惩罚团队，因为这会产生隐瞒事故或虚高SLO目标以人为扩大预算的动机",
      "不要对所有服务应用同样严格的策略，因为支付服务0.1%预算的业务影响与推荐组件的0.1%截然不同"
    ],
    "case_study_company": "Google",
    "case_study": "Google's internal error budget policy is the canonical example: when a service like Gmail exhausts its error budget, the development team must halt feature launches and dedicate the next sprint to reliability improvements -- fixing flaky tests, addressing tech debt, and hardening infrastructure. This policy resolved a decades-old organizational tension by making reliability investment automatic rather than political. Teams that consistently stay within budget earn more autonomy for risky experiments, creating a positive feedback loop. The model's success at Google led to widespread industry adoption after the SRE Book's publication in 2016.",
    "case_study_zh": "Google的内部错误预算策略是经典案例：当Gmail等服务耗尽错误预算时，开发团队必须暂停功能发布，将下一个冲刺专注于可靠性改进——修复不稳定测试、解决技术债和加固基础设施。这一策略通过使可靠性投入自动化而非政治化，解决了长达数十年的组织张力。持续保持在预算范围内的团队获得更多进行风险实验的自主权，形成正向反馈循环。该模型在Google的成功导致2016年SRE Book出版后被行业广泛采用。",
    "when_not_to_use": [
      "Small teams without dedicated SRE function where the policy overhead exceeds the coordination benefit",
      "Systems with legal zero-tolerance requirements for errors where error budgets are conceptually inapplicable",
      "Pre-product-market-fit startups where speed is existential and any deployment friction could be fatal",
      "Organizations lacking basic SLO measurement infrastructure -- you cannot enforce budgets you cannot measure"
    ],
    "when_not_to_use_zh": [
      "没有专门SRE职能的小型团队——策略开销超过协调收益",
      "对错误有法律零容忍要求的系统——错误预算在概念上不适用",
      "尚未达到产品-市场契合的初创公司——速度关乎生存，任何部署摩擦都可能是致命的",
      "缺乏基础SLO度量基础设施的组织——无法执行无法度量的预算"
    ],
    "adopters": [
      "Google",
      "Netflix",
      "Uber",
      "Dropbox",
      "LinkedIn"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "reliability",
      "observability"
    ],
    "maturity_ring": "established",
    "primary_source": "Beyer, B. et al. (2016). \"Site Reliability Engineering: How Google Runs Production Systems\", Ch. 3. O'Reilly Media.",
    "secondary_sources": [
      "Beyer, B. et al. (2018). \"The Site Reliability Workbook\", Ch. 3. O'Reilly Media.",
      "Hidalgo, A. (2020). \"Implementing Service Level Objectives\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "slo-as-practice",
        "type": "extends"
      },
      {
        "slug": "sli-slo-sla",
        "type": "prerequisite"
      },
      {
        "slug": "on-call-engineering",
        "type": "complement"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 156,
    "name": "Runbook Automation",
    "name_zh": "运维手册自动化",
    "slug": "runbook-automation",
    "category": "observability",
    "desc": "Codified incident response procedures that reduce toil and human error during incidents",
    "desc_zh": "将事故响应流程代码化，减少事故期间的重复劳动和人为失误",
    "steps": [
      "Audit existing manual runbooks: catalog all incident response procedures and classify them by frequency, complexity, and risk of human error",
      "Prioritize automation candidates: start with high-frequency, low-risk procedures (restart service, clear cache, scale up) that consume the most on-call toil",
      "Codify runbooks as executable scripts or workflows: use infrastructure automation tools (Ansible, Terraform, custom scripts) with safety guards (dry-run mode, blast radius limits, approval gates)",
      "Integrate automated runbooks with alerting: connect alerts to auto-remediation workflows so common issues self-heal without paging humans",
      "Continuously validate and iterate: test runbook automations regularly against staging, track mean time to remediation (MTTR), and update scripts as the system evolves"
    ],
    "steps_zh": [
      "审计现有手动运维手册：编目所有事故响应流程，按频率、复杂度和人为失误风险分类",
      "确定自动化候选者优先级：从消耗最多值班人工的高频率、低风险流程开始（重启服务、清除缓存、扩容）",
      "将运维手册编码为可执行脚本或工作流：使用基础设施自动化工具（Ansible、Terraform、自定义脚本）并加入安全防护（试运行模式、爆炸半径限制、审批门控）",
      "将自动化运维手册与告警集成：连接告警与自动修复工作流，使常见问题在不呼叫人员的情况下自愈",
      "持续验证和迭代：定期在预发布环境测试运维手册自动化，追踪平均修复时间（MTTR），随系统演进更新脚本"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Audit Runbooks",
      "Prioritize",
      "Codify Scripts",
      "Alert Integration",
      "Validate Iterate"
    ],
    "viz_labels_zh": [
      "审计手册",
      "优先排序",
      "代码化脚本",
      "告警集成",
      "持续验证"
    ],
    "related": [
      "on-call-engineering",
      "error-budget-policy",
      "slo-as-practice",
      "infrastructure-as-code",
      "chaos-engineering"
    ],
    "tags": [
      "observability",
      "runbook",
      "automation",
      "incident-response",
      "toil-reduction",
      "sre"
    ],
    "origin_author": "Google SRE team; formalized in Site Reliability Engineering book (2016)",
    "origin_source": "Site Reliability Engineering (Beyer et al., 2016); Software Engineering at Google (Winters, Manshreck, Wright, 2020)",
    "origin_source_zh": "《SRE：Google运维解密》（Beyer等，2016）；《Google软件工程》（Winters、Manshreck、Wright，2020）",
    "complexity": "intermediate",
    "when_to_use": [
      "On-call teams spending more than 50% of their time on repetitive, manual remediation tasks (toil)",
      "Incident response procedures that are well-understood and follow a deterministic decision tree",
      "Organizations where manual runbook execution during 3am incidents leads to high error rates",
      "Services with recurring failure modes (disk full, certificate expiry, connection pool exhaustion) that follow the same fix pattern"
    ],
    "when_to_use_zh": [
      "值班团队超过50%时间用于重复性手动修复任务（苦力活）",
      "已被充分理解且遵循确定性决策树的事故响应流程",
      "凌晨3点手动执行运维手册导致高错误率的组织",
      "具有反复出现的故障模式（磁盘满、证书过期、连接池耗尽）且修复模式相同的服务"
    ],
    "core_concepts": [
      "Toil: Repetitive, manual, automatable operational work that scales linearly with service size and provides no enduring value -- the primary target for runbook automation",
      "Auto-Remediation: Automated workflows triggered directly by alerts that execute predefined fix steps without human intervention for known failure patterns",
      "Safety Guards: Dry-run modes, blast radius limits, rollback triggers, and human approval gates that prevent automated runbooks from making incidents worse",
      "Runbook-as-Code: Version-controlled, tested, and reviewed automation scripts stored alongside the service code they operate on, following software engineering best practices",
      "Escalation Ladder: When automation fails or encounters an unknown condition, it stops and escalates to a human with full context rather than retrying blindly"
    ],
    "core_concepts_zh": [
      "苦力活（Toil）：随服务规模线性增长且不提供持久价值的重复性、手动、可自动化的运维工作——运维手册自动化的主要目标",
      "自动修复：由告警直接触发的自动化工作流，无需人工介入即可对已知故障模式执行预定义修复步骤",
      "安全防护：试运行模式、爆炸半径限制、回滚触发器和人工审批门控，防止自动化运维手册使事故恶化",
      "运维手册即代码：版本控制、测试和审查的自动化脚本，与其操作的服务代码一起存储，遵循软件工程最佳实践",
      "升级阶梯：当自动化失败或遇到未知情况时，应停止执行并携带完整上下文升级给人工处理，而非盲目重试"
    ],
    "timeline": [
      [
        "2003",
        "Google SRE begins systematic elimination of toil, establishing the principle that toil should not exceed 50% of an SRE's time"
      ],
      [
        "2012",
        "PagerDuty launches incident response automation, making runbook-triggered workflows accessible to smaller teams"
      ],
      [
        "2016",
        "Google SRE Book codifies toil budgets and runbook automation as core SRE practices"
      ],
      [
        "2019",
        "Shoreline.io and other platforms emerge offering auto-remediation as a service with built-in safety rails"
      ],
      [
        "2023",
        "AI-assisted runbooks emerge: LLMs help generate, validate, and execute remediation steps from incident context"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Google SRE开始系统性消除苦力活，确立苦力活不应超过SRE时间50%的原则"
      ],
      [
        "2012",
        "PagerDuty推出事故响应自动化，使运维手册触发的工作流对较小团队可用"
      ],
      [
        "2016",
        "Google SRE Book将苦力活预算和运维手册自动化编纂为核心SRE实践"
      ],
      [
        "2019",
        "Shoreline.io等平台出现，提供带有内置安全护栏的自动修复即服务"
      ],
      [
        "2023",
        "AI辅助运维手册出现：LLM帮助从事故上下文中生成、验证和执行修复步骤"
      ]
    ],
    "dos": [
      "Do start by automating the simplest and most frequent runbooks first, because quick wins build organizational trust in automation",
      "Do include blast radius limits in every automated runbook (e.g., only restart 10% of pods at a time), because unconstrained automation can amplify incidents",
      "Do version-control runbook automation alongside the service code, because runbooks must evolve with the system they remediate",
      "Do log every automated action with full context, because post-incident review requires a complete audit trail of what the automation did"
    ],
    "dos_zh": [
      "从最简单和最频繁的运维手册开始自动化，因为快速见效能建立组织对自动化的信任",
      "在每个自动化运维手册中包含爆炸半径限制（如一次仅重启10%的Pod），因为不受约束的自动化可能放大事故",
      "将运维手册自动化与服务代码一起版本控制，因为运维手册必须与其修复的系统一起演进",
      "记录每个自动化操作的完整上下文日志，因为事后回顾需要自动化行为的完整审计追踪"
    ],
    "donts": [
      "Don't automate procedures that require human judgment or context-dependent decisions, because automation cannot reason about novel failure modes",
      "Don't skip dry-run testing of automated runbooks, because an untested automation script in production is more dangerous than a manual runbook",
      "Don't let auto-remediation mask chronic problems, because repeatedly auto-fixing the same issue hides the need for a proper root cause fix",
      "Don't build automation without an escalation path, because automation must know when to stop and ask for help rather than retrying indefinitely"
    ],
    "donts_zh": [
      "不要自动化需要人类判断或依赖上下文决策的流程，因为自动化无法对新型故障模式进行推理",
      "不要跳过自动化运维手册的试运行测试，因为未经测试的自动化脚本在生产中比手动运维手册更危险",
      "不要让自动修复掩盖慢性问题，因为反复自动修复相同问题会隐藏进行根因修复的需要",
      "不要构建没有升级路径的自动化，因为自动化必须知道何时停止并请求帮助而非无限重试"
    ],
    "case_study_company": "Google",
    "case_study": "Google SRE's toil budget principle mandates that SREs spend no more than 50% of their time on operational toil. When the Gmail team found that disk-related alerts accounted for 30% of on-call pages, they automated the entire remediation workflow: detect disk pressure, migrate affected shards, provision new storage, and validate service health -- all without human intervention. This single automation reduced on-call pages by 25% and freed engineers to work on proactive reliability improvements. The success pattern was replicated across Google, with automated runbooks now handling over 60% of common incident types.",
    "case_study_zh": "Google SRE的苦力活预算原则要求SRE用于运维苦力活的时间不超过50%。当Gmail团队发现磁盘相关告警占值班呼叫的30%时，他们自动化了整个修复工作流：检测磁盘压力、迁移受影响的分片、配置新存储并验证服务健康——全程无需人工介入。这单项自动化减少了25%的值班呼叫，释放工程师投入主动可靠性改进。该成功模式在Google全公司复制，自动化运维手册现在处理超过60%的常见事故类型。",
    "when_not_to_use": [
      "Novel or rare failure modes that have not been seen before and lack a known remediation procedure",
      "Procedures involving irreversible destructive actions (data deletion, key rotation) where human verification is essential",
      "Small-scale systems where the investment in building and maintaining automation exceeds the toil it would save",
      "Organizations without a blameless incident culture, where automation failures would be punished rather than learned from"
    ],
    "when_not_to_use_zh": [
      "从未遇到过且缺乏已知修复流程的新型或罕见故障模式",
      "涉及不可逆破坏性操作（数据删除、密钥轮换）且人工验证不可或缺的流程",
      "构建和维护自动化的投入超过其能节省的苦力活的小规模系统",
      "缺乏无指责事故文化的组织——自动化失败会被惩罚而非从中学习"
    ],
    "adopters": [
      "Google",
      "Amazon",
      "PagerDuty",
      "Shoreline.io",
      "Datadog"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "observability"
    ],
    "maturity_ring": "established",
    "primary_source": "Beyer, B. et al. (2016). \"Site Reliability Engineering: How Google Runs Production Systems\", Ch. 7. O'Reilly Media.",
    "secondary_sources": [
      "Winters, T., Manshreck, T. & Wright, H. (2020). \"Software Engineering at Google\". O'Reilly Media.",
      "Limoncelli, T.A., Chalup, S.R. & Hogan, C.J. (2014). \"The Practice of Cloud System Administration\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "on-call-engineering",
        "type": "complement"
      },
      {
        "slug": "error-budget-policy",
        "type": "complement"
      },
      {
        "slug": "slo-as-practice",
        "type": "complement"
      },
      {
        "slug": "infrastructure-as-code",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 157,
    "name": "On-Call Engineering",
    "name_zh": "值班工程实践",
    "slug": "on-call-engineering",
    "category": "observability",
    "desc": "Sustainable on-call practices, escalation paths, and human-centric incident response",
    "desc_zh": "可持续的值班实践、升级路径和以人为中心的事故响应",
    "steps": [
      "Design the on-call rotation: ensure at least 8 people in the rotation for sustainable coverage, with primary and secondary responders and clear handoff procedures",
      "Define escalation policies: establish time-based escalation (page secondary after 15min, escalate to management after 30min) and severity-based routing",
      "Equip on-call engineers: provide runbooks for every alert, one-click access to dashboards, and pre-authorized remediation playbooks so responders can act immediately",
      "Conduct blameless post-incident reviews: after every significant incident, hold a structured retrospective focused on systemic improvements, not individual blame",
      "Measure and improve on-call health: track pages per shift, false positive rate, MTTR, sleep interruption frequency, and use these metrics to drive toil reduction"
    ],
    "steps_zh": [
      "设计值班轮换：确保轮换中至少8人以实现可持续覆盖，设立主值和副值并明确交接流程",
      "定义升级策略：建立基于时间的升级（15分钟后呼叫副值，30分钟后升级到管理层）和基于严重性的路由",
      "为值班工程师提供装备：为每个告警提供运维手册、一键访问仪表盘和预授权的修复手册，使响应者能立即行动",
      "进行无指责的事后回顾：每次重大事故后举行结构化复盘，聚焦系统性改进而非个人指责",
      "度量并改善值班健康度：追踪每班呼叫数、误报率、MTTR、睡眠中断频率，并用这些指标驱动苦力活减少"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Rotation Design",
      "Escalation Policy",
      "Equip Engineers",
      "Post-Incident Review",
      "On-Call Health"
    ],
    "viz_labels_zh": [
      "轮值设计",
      "升级策略",
      "工具准备",
      "事后复盘",
      "健康度量"
    ],
    "related": [
      "runbook-automation",
      "error-budget-policy",
      "slo-as-practice",
      "sli-slo-sla"
    ],
    "tags": [
      "observability",
      "on-call",
      "incident-response",
      "escalation",
      "sre",
      "toil"
    ],
    "origin_author": "Google SRE team; systematized by Ben Treynor Sloss, ~2003",
    "origin_source": "Site Reliability Engineering (Beyer et al., 2016), Ch. 11 'Being On-Call'; Observability Engineering (Majors, Fong-Jones, Miranda, 2022)",
    "origin_source_zh": "《SRE：Google运维解密》（Beyer等，2016）第11章「值班」；《可观测性工程》（Majors、Fong-Jones、Miranda，2022）",
    "complexity": "intermediate",
    "when_to_use": [
      "Any production system that requires human response to incidents outside business hours",
      "Teams experiencing on-call burnout from excessive pages, unclear escalation, or insufficient support",
      "Organizations transitioning from 'developers throw code over the wall to ops' to shared production ownership",
      "Growing engineering teams that need to formalize ad-hoc incident response into a scalable system"
    ],
    "when_to_use_zh": [
      "任何需要在工作时间之外对事故进行人工响应的生产系统",
      "因过多呼叫、升级路径不清晰或支持不足而经历值班倦怠的团队",
      "从「开发把代码扔给运维」转向共享生产所有权的组织",
      "需要将临时事故响应正式化为可扩展系统的成长中工程团队"
    ],
    "core_concepts": [
      "Sustainable Rotation: A minimum of 8 engineers per rotation ensures no one is on-call more than 25% of the time, preventing burnout and maintaining response quality",
      "Escalation Policy: A time-and-severity-based decision tree that routes pages to the right responder and automatically escalates when initial response does not resolve within defined windows",
      "Blameless Post-Incident Review: A structured analysis of incidents focused on systemic causes, timeline reconstruction, and actionable improvements -- never on assigning individual blame",
      "On-Call Compensation: Explicit recognition that on-call work is real work deserving compensation (time off, pay), not an invisible tax on engineers' personal time",
      "Alert Quality: Every alert must be actionable, unique, and linked to a runbook; non-actionable alerts are noise that degrades on-call health and response effectiveness"
    ],
    "core_concepts_zh": [
      "可持续轮换：每个轮换至少8名工程师，确保无人值班时间超过25%，防止倦怠并维持响应质量",
      "升级策略：基于时间和严重性的决策树，将呼叫路由到正确的响应者，并在初始响应未在定义窗口内解决时自动升级",
      "无指责事后回顾：聚焦于系统性原因、时间线重建和可操作改进的结构化事故分析——绝不归咎于个人",
      "值班补偿：明确承认值班工作是真正的工作，值得补偿（调休、报酬），而非对工程师个人时间的隐形征税",
      "告警质量：每个告警必须可操作、唯一且关联运维手册；不可操作的告警是噪音，会降低值班健康度和响应效果"
    ],
    "timeline": [
      [
        "2003",
        "Google SRE establishes formal on-call practices with the principle that SREs should spend at most 25% of time on on-call duties"
      ],
      [
        "2011",
        "PagerDuty popularizes modern on-call management with mobile alerting, escalation automation, and schedule management"
      ],
      [
        "2016",
        "Google SRE Book publishes comprehensive on-call guidelines including rotation sizing, compensation, and post-incident review"
      ],
      [
        "2018",
        "Charity Majors and the Honeycomb team advocate for 'observability-driven' on-call where high-cardinality data replaces dashboard staring"
      ],
      [
        "2022",
        "On-call equity becomes a widespread industry concern; companies formalize compensation, rotation fairness, and burnout prevention"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Google SRE建立正式值班实践，确立SRE用于值班的时间最多不超过25%的原则"
      ],
      [
        "2011",
        "PagerDuty通过移动告警、升级自动化和排班管理普及了现代值班管理"
      ],
      [
        "2016",
        "Google SRE Book发布全面的值班指南，包括轮换规模、补偿和事后回顾"
      ],
      [
        "2018",
        "Charity Majors和Honeycomb团队倡导「可观测性驱动」的值班模式，用高基数数据取代盯仪表盘"
      ],
      [
        "2022",
        "值班公平性成为行业普遍关注的问题；公司正式化补偿、轮换公平性和倦怠预防"
      ]
    ],
    "dos": [
      "Do ensure every alert has a linked runbook that tells the responder exactly what to investigate and how to remediate, because alert without context wastes precious incident time",
      "Do hold blameless post-incident reviews for every significant incident, because systemic improvements only emerge when people feel safe sharing what went wrong",
      "Do track on-call health metrics (pages per shift, MTTR, sleep interruptions) and set improvement targets, because what you don't measure you cannot improve",
      "Do provide explicit on-call compensation (extra pay, comp time, reduced next-week workload), because uncompensated on-call drives attrition"
    ],
    "dos_zh": [
      "确保每个告警关联一份运维手册，告诉响应者确切要调查什么和如何修复，因为没有上下文的告警浪费宝贵的事故时间",
      "为每次重大事故举行无指责事后回顾，因为只有当人们感到安全地分享出了什么问题时，系统性改进才会出现",
      "追踪值班健康指标（每班呼叫数、MTTR、睡眠中断）并设定改进目标，因为不度量的东西无法改进",
      "提供明确的值班补偿（额外报酬、调休、减少下周工作量），因为无补偿的值班会推动人才流失"
    ],
    "donts": [
      "Don't have fewer than 8 people in a rotation, because smaller rotations lead to unsustainable on-call frequency and rapid burnout",
      "Don't tolerate alert noise -- if an alert fires and the on-call ignores it because it is always a false positive, that alert must be fixed or deleted",
      "Don't skip post-incident reviews for 'small' incidents, because pattern analysis across many small incidents often reveals systemic issues",
      "Don't put junior engineers on-call without a shadow period and a senior secondary, because unsupported on-call is both ineffective and psychologically harmful"
    ],
    "donts_zh": [
      "不要让轮换中少于8人，因为较小的轮换导致不可持续的值班频率和快速倦怠",
      "不要容忍告警噪音——如果告警响了而值班人员因为它总是误报而忽略它，那个告警必须被修复或删除",
      "不要因为是「小」事故就跳过事后回顾，因为对许多小事故的模式分析往往能揭示系统性问题",
      "不要在没有跟随期和高级副值的情况下让初级工程师值班，因为无支持的值班既无效又对心理有害"
    ],
    "case_study_company": "PagerDuty",
    "case_study": "PagerDuty practices what they preach by running their own platform with sophisticated on-call engineering practices. They maintain an 8-person primary rotation with automatic escalation after 5 minutes. Every alert links to a runbook, and non-actionable alerts are tracked as 'noise budget' similar to an error budget. After implementing their on-call health metrics program, they reduced after-hours pages by 40% over 6 months and improved MTTR from 45 minutes to 15 minutes. Their blameless post-incident review process generates an average of 3 systemic improvements per incident, fed directly into sprint planning.",
    "case_study_zh": "PagerDuty践行自己倡导的理念，用精密的值班工程实践运行自己的平台。他们维护一个8人主值轮换，5分钟后自动升级。每个告警关联一份运维手册，不可操作的告警作为类似错误预算的「噪音预算」被追踪。实施值班健康指标计划后，他们在6个月内将工作时间外呼叫减少了40%，MTTR从45分钟改善到15分钟。他们的无指责事后回顾流程平均每次事故产生3项系统性改进，直接纳入冲刺规划。",
    "when_not_to_use": [
      "Internal tools that are only used during business hours and do not require 24/7 incident response",
      "Very early-stage products with no paying customers where formal on-call adds overhead without proportional benefit",
      "Fully serverless architectures managed by a cloud provider where the provider handles infrastructure incidents",
      "Systems where automated remediation handles all known failure modes and human intervention is genuinely unnecessary"
    ],
    "when_not_to_use_zh": [
      "仅在工作时间使用且不需要7x24事故响应的内部工具",
      "没有付费客户的极早期产品——正式值班增加开销而无相称收益",
      "由云提供商管理的全无服务器架构——提供商处理基础设施事故",
      "自动修复处理所有已知故障模式且人工介入确实不必要的系统"
    ],
    "adopters": [
      "Google",
      "PagerDuty",
      "Atlassian",
      "Microsoft",
      "Meta"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "reliability",
      "observability"
    ],
    "maturity_ring": "established",
    "primary_source": "Beyer, B. et al. (2016). \"Site Reliability Engineering: How Google Runs Production Systems\", Ch. 11. O'Reilly Media.",
    "secondary_sources": [
      "Majors, C., Fong-Jones, L. & Miranda, G. (2022). \"Observability Engineering\". O'Reilly Media.",
      "Beyer, B. et al. (2018). \"The Site Reliability Workbook\", Ch. 8. O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "runbook-automation",
        "type": "complement"
      },
      {
        "slug": "error-budget-policy",
        "type": "complement"
      },
      {
        "slug": "slo-as-practice",
        "type": "complement"
      },
      {
        "slug": "sli-slo-sla",
        "type": "prerequisite"
      }
    ]
  },
  {
    "id": 158,
    "name": "Feature Flag Observability",
    "name_zh": "功能开关可观测性",
    "slug": "feature-flag-observability",
    "category": "observability",
    "desc": "Monitoring feature rollout impact by correlating flag state with system and business metrics",
    "desc_zh": "通过关联功能开关状态与系统和业务指标来监控功能发布的影响",
    "steps": [
      "Instrument feature flag evaluations: emit telemetry events for every flag evaluation including flag name, variant, user segment, and evaluation context",
      "Attach flag state to observability signals: add active feature flag variants as attributes on traces, metrics, and logs so all telemetry becomes flag-aware",
      "Build comparative dashboards: display key metrics (error rate, latency, conversion rate) split by flag variant to detect impact in real time",
      "Set automated rollback triggers: define metric thresholds per flag that automatically disable a flag variant when it degrades system health beyond tolerance",
      "Analyze flag lifecycle impact: after full rollout, compare before/after metrics to validate the feature's value and generate evidence for stakeholder reviews"
    ],
    "steps_zh": [
      "对功能开关评估进行埋点：为每次开关评估发出遥测事件，包含开关名、变体、用户分组和评估上下文",
      "将开关状态附加到可观测性信号：将激活的功能开关变体作为属性添加到追踪、指标和日志中，使所有遥测具备开关感知能力",
      "构建比较仪表盘：按开关变体拆分显示关键指标（错误率、延迟、转化率），实时检测影响",
      "设置自动回滚触发器：为每个开关定义指标阈值，当系统健康度下降超过容忍范围时自动禁用该开关变体",
      "分析开关生命周期影响：全量发布后比较前后指标以验证功能价值，为利益相关者审查生成证据"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Flag Evaluation",
      "Attach to Signals",
      "Variant Dashboard",
      "Auto Rollback",
      "Lifecycle Analyze"
    ],
    "viz_labels_zh": [
      "标志评估",
      "信号附加",
      "变体看板",
      "自动回滚",
      "生命周期分析"
    ],
    "related": [
      "feature-flags",
      "opentelemetry",
      "canary-deployment",
      "slo-as-practice",
      "red-method"
    ],
    "tags": [
      "observability",
      "feature-flags",
      "rollout",
      "experimentation",
      "metrics-correlation",
      "progressive-delivery"
    ],
    "origin_author": "LaunchDarkly, Split.io, and progressive delivery community, ~2017",
    "origin_source": "Observability Engineering (Majors, Fong-Jones, Miranda, 2022); OpenFeature specification (CNCF, 2022)",
    "origin_source_zh": "《可观测性工程》（Majors、Fong-Jones、Miranda，2022）；OpenFeature规范（CNCF，2022）",
    "complexity": "intermediate",
    "when_to_use": [
      "Progressive delivery workflows where features are rolled out incrementally and must be monitored for impact at each stage",
      "A/B testing and experimentation programs that need statistical rigor in measuring feature impact on business metrics",
      "Services with multiple concurrent feature flags where interactions between flags can cause unexpected behavior",
      "Regulated environments where every feature change must be auditable with before/after metric evidence"
    ],
    "when_to_use_zh": [
      "功能逐步发布且需在每个阶段监控影响的渐进式交付工作流",
      "需要统计严谨性来衡量功能对业务指标影响的A/B测试和实验计划",
      "具有多个并发功能开关且开关间交互可能导致意外行为的服务",
      "每次功能变更必须有前后指标证据可审计的受监管环境"
    ],
    "core_concepts": [
      "Flag-Aware Telemetry: Every trace, metric, and log entry carries the active feature flag variants as attributes, enabling filtering and grouping by flag state across all observability tools",
      "Metric Correlation: Statistical comparison of key metrics (error rate, latency, conversion) between flag-on and flag-off cohorts to isolate feature impact from background noise",
      "Automated Guardrails: Predefined metric thresholds per flag that trigger automatic rollback when a feature causes degradation beyond acceptable limits",
      "Flag Interaction Detection: Monitoring for unexpected metric changes when multiple flags are active simultaneously, since flag combinations can produce emergent behaviors not seen in isolation",
      "Experimentation Analytics: Applying statistical significance testing (chi-squared, t-test) to flag variant metrics to distinguish real feature impact from random variation"
    ],
    "core_concepts_zh": [
      "开关感知遥测：每个追踪、指标和日志条目携带激活的功能开关变体作为属性，支持跨所有可观测性工具按开关状态过滤和分组",
      "指标关联：对开关开启和关闭群组的关键指标（错误率、延迟、转化率）进行统计比较，从背景噪音中隔离功能影响",
      "自动护栏：每个开关的预定义指标阈值，当功能导致超出可接受限度的劣化时触发自动回滚",
      "开关交互检测：监控多个开关同时激活时的意外指标变化，因为开关组合可能产生孤立测试中未见的涌现行为",
      "实验分析：对开关变体指标应用统计显著性检验（卡方、t检验），区分真实功能影响与随机变异"
    ],
    "timeline": [
      [
        "2013",
        "LaunchDarkly launches with the vision of connecting feature flags to operational metrics"
      ],
      [
        "2017",
        "Progressive delivery movement formalizes the link between feature flags, canary deployments, and observability"
      ],
      [
        "2019",
        "Split.io pioneers feature experimentation platforms that combine flag management with statistical analysis"
      ],
      [
        "2022",
        "OpenFeature (CNCF) standardizes feature flag APIs with built-in hooks for telemetry emission"
      ],
      [
        "2024",
        "Major observability platforms (Datadog, Grafana) add native feature flag correlation to their trace and metric views"
      ]
    ],
    "timeline_zh": [
      [
        "2013",
        "LaunchDarkly以将功能开关与运营指标连接的愿景创立"
      ],
      [
        "2017",
        "渐进式交付运动正式化了功能开关、金丝雀部署和可观测性之间的联系"
      ],
      [
        "2019",
        "Split.io开创功能实验平台，将开关管理与统计分析相结合"
      ],
      [
        "2022",
        "OpenFeature（CNCF）标准化功能开关API，内置遥测发射钩子"
      ],
      [
        "2024",
        "主要可观测性平台（Datadog、Grafana）在追踪和指标视图中添加原生功能开关关联"
      ]
    ],
    "dos": [
      "Do emit flag evaluation events as first-class telemetry so every observability query can be filtered by flag variant",
      "Do define rollback thresholds before enabling a flag, because deciding what constitutes failure during an incident is too late",
      "Do monitor flag interactions when multiple flags are active, because two individually safe flags can produce harmful combinations",
      "Do track flag staleness and enforce cleanup, because unmonitored long-lived flags accumulate risk and observability blind spots"
    ],
    "dos_zh": [
      "将开关评估事件作为一等遥测发射，使每个可观测性查询都能按开关变体过滤",
      "在启用开关前定义回滚阈值，因为在事故期间决定什么构成失败已经太晚",
      "在多个开关同时激活时监控开关交互，因为两个单独安全的开关可能产生有害组合",
      "追踪开关过期状态并强制清理，因为未监控的长寿命开关会积累风险和可观测性盲区"
    ],
    "donts": [
      "Don't roll out feature flags without metric baselines, because you cannot detect regression without knowing what normal looks like",
      "Don't rely on manual dashboard watching for flag rollouts, because automated guardrails catch problems faster than human monitoring at 2am",
      "Don't ignore the observability cost of high-cardinality flag attributes, because attaching 50 flag variants to every span explodes storage and query costs",
      "Don't treat feature flag observability as optional for 'small' flags, because small changes cause big incidents when they hit unexpected code paths"
    ],
    "donts_zh": [
      "不要在没有指标基线的情况下发布功能开关，因为不知道正常状态就无法检测回归",
      "不要在开关发布时依赖手动仪表盘监看，因为自动护栏比凌晨2点的人类监控更快发现问题",
      "不要忽视高基数开关属性的可观测性成本，因为在每个Span上附加50个开关变体会使存储和查询成本爆炸",
      "不要将功能开关可观测性视为「小」开关的可选项，因为小变更在触及意外代码路径时会引发大事故"
    ],
    "case_study_company": "LaunchDarkly",
    "case_study": "LaunchDarkly's own platform exemplifies feature flag observability. When rolling out their real-time streaming architecture to replace polling-based flag delivery, they used their own flag-aware metrics to monitor latency, error rates, and flag evaluation consistency across both variants. By attaching flag state to every trace via OpenTelemetry, they detected a 15ms latency regression in the streaming path that only affected customers with more than 500 flags. The automated guardrail paused the rollout at 5% traffic, giving the team time to optimize the streaming protocol before resuming. Without flag-correlated observability, this long-tail performance issue would have reached 100% of customers.",
    "case_study_zh": "LaunchDarkly自己的平台是功能开关可观测性的典范。在推出实时流架构以替代基于轮询的开关分发时，他们使用自己的开关感知指标监控两个变体的延迟、错误率和开关评估一致性。通过OpenTelemetry将开关状态附加到每个追踪上，他们检测到流路径中仅影响拥有500个以上开关的客户的15毫秒延迟回归。自动护栏在5%流量时暂停了发布，给团队时间在恢复前优化流协议。如果没有开关关联的可观测性，这个长尾性能问题将影响100%的客户。",
    "when_not_to_use": [
      "Simple boolean kill-switches that are either fully on or fully off with no gradual rollout",
      "Feature flags used only in development/staging environments where production observability is irrelevant",
      "Systems with very low traffic where statistical comparison between flag cohorts lacks sufficient sample size",
      "Short-lived feature flags that will be removed within a single sprint before meaningful metric data accumulates"
    ],
    "when_not_to_use_zh": [
      "简单的布尔熔断开关，完全开启或完全关闭，没有渐进式发布",
      "仅在开发/预发布环境中使用的功能开关——生产可观测性不相关",
      "流量极低的系统——开关群组之间的统计比较缺乏足够样本量",
      "将在单个冲刺内移除、有意义的指标数据尚未积累的短寿命功能开关"
    ],
    "adopters": [
      "LaunchDarkly",
      "Split.io",
      "Spotify",
      "Atlassian",
      "Booking.com"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Majors, C., Fong-Jones, L. & Miranda, G. (2022). \"Observability Engineering\". O'Reilly Media.",
    "secondary_sources": [
      "CNCF (2022). \"OpenFeature Specification\". openfeature.dev.",
      "Fitz, P. (2010). \"Feature Flags (Flippers)\". flickr.com (Flickr Engineering Blog)."
    ],
    "typed_relations": [
      {
        "slug": "feature-flags",
        "type": "extends"
      },
      {
        "slug": "opentelemetry",
        "type": "complement"
      },
      {
        "slug": "canary-deployment",
        "type": "complement"
      },
      {
        "slug": "slo-as-practice",
        "type": "complement"
      },
      {
        "slug": "red-method",
        "type": "complement"
      }
    ]
  },
  {
    "id": 159,
    "name": "Developer Portal (Backstage)",
    "name_zh": "开发者门户（Backstage）",
    "slug": "developer-portal-backstage",
    "category": "observability",
    "desc": "Centralized developer experience platform unifying service catalog, docs, and tooling",
    "desc_zh": "统一服务目录、文档和工具链的集中式开发者体验平台",
    "steps": [
      "Deploy Backstage as the single entry point for developer tooling: set up the core platform with authentication, plugin architecture, and organizational branding",
      "Build the software catalog: register every service, library, and data pipeline with ownership metadata, lifecycle stage, and dependency information",
      "Integrate existing tools via plugins: connect CI/CD (GitHub Actions, Jenkins), observability (Grafana, PagerDuty), cloud resources (Kubernetes, AWS), and documentation (TechDocs)",
      "Create software templates: build golden-path templates for new services, libraries, and pipelines that embed organizational standards from day one",
      "Measure developer experience: track portal adoption metrics (active users, template usage, time-to-first-deploy for new services) and iterate based on developer feedback"
    ],
    "steps_zh": [
      "将Backstage部署为开发者工具的单一入口：搭建核心平台，配置认证、插件架构和组织品牌",
      "构建软件目录：注册每个服务、库和数据管道，包含所有权元数据、生命周期阶段和依赖信息",
      "通过插件集成现有工具：连接CI/CD（GitHub Actions、Jenkins）、可观测性（Grafana、PagerDuty）、云资源（Kubernetes、AWS）和文档（TechDocs）",
      "创建软件模板：为新服务、库和管道构建「黄金路径」模板，从第一天起嵌入组织标准",
      "度量开发者体验：追踪门户采用指标（活跃用户、模板使用率、新服务首次部署时间），并根据开发者反馈迭代"
    ],
    "ai_relevant": true,
    "viz_type": "tree",
    "viz_labels": [
      "Deploy Portal",
      "Software Catalog",
      "Tool Plugins",
      "Service Templates",
      "DevEx Metrics"
    ],
    "viz_labels_zh": [
      "部署门户",
      "软件目录",
      "工具插件",
      "服务模板",
      "开发体验"
    ],
    "related": [
      "documentation-as-code",
      "dora-metrics",
      "opentelemetry",
      "slo-as-practice"
    ],
    "tags": [
      "observability",
      "developer-experience",
      "backstage",
      "service-catalog",
      "internal-developer-platform",
      "dx"
    ],
    "origin_author": "Spotify (Stefan Ålund, Niklas Ek), 2020",
    "origin_source": "Backstage.io documentation (Spotify, 2020); Software Engineering at Google (Winters, Manshreck, Wright, 2020)",
    "origin_source_zh": "Backstage.io文档（Spotify，2020）；《Google软件工程》（Winters、Manshreck、Wright，2020）",
    "complexity": "advanced",
    "when_to_use": [
      "Organizations with 50+ services where developers spend significant time finding information about other teams' services",
      "Engineering teams where onboarding a new developer takes weeks because tooling, docs, and service ownership are scattered",
      "Platform engineering teams building an internal developer platform and needing a unified frontend layer",
      "Companies standardizing service creation through golden-path templates to enforce best practices without manual review"
    ],
    "when_to_use_zh": [
      "拥有50+服务且开发者花大量时间查找其他团队服务信息的组织",
      "由于工具、文档和服务所有权分散，新开发者入职需要数周的工程团队",
      "正在构建内部开发者平台并需要统一前端层的平台工程团队",
      "通过黄金路径模板标准化服务创建、无需手动审查即可执行最佳实践的公司"
    ],
    "core_concepts": [
      "Software Catalog: A centralized registry of all software components (services, libraries, pipelines, websites) with ownership, lifecycle, and dependency metadata that answers 'who owns this and what depends on it'",
      "Golden Path Templates: Scaffolding templates that generate new services pre-configured with CI/CD, monitoring, logging, and security best practices, reducing setup from days to minutes",
      "Plugin Architecture: An extensible system where teams contribute plugins for their tools (Kubernetes, Grafana, PagerDuty, GitHub), creating a composable developer experience",
      "TechDocs: Documentation-as-code rendered directly in the portal alongside the service it describes, eliminating the 'where are the docs' problem",
      "Scorecards: Automated assessments that track each service's compliance with engineering standards (has SLOs, has runbooks, tests pass, docs updated) and surface gaps"
    ],
    "core_concepts_zh": [
      "软件目录：所有软件组件（服务、库、管道、网站）的集中注册中心，包含所有权、生命周期和依赖元数据，回答「这是谁的、什么依赖它」",
      "黄金路径模板：脚手架模板，生成预配置了CI/CD、监控、日志和安全最佳实践的新服务，将搭建时间从天缩短到分钟",
      "插件架构：可扩展系统，团队为其工具（Kubernetes、Grafana、PagerDuty、GitHub）贡献插件，创建可组合的开发者体验",
      "TechDocs：文档即代码，直接在门户中与其描述的服务一起渲染，消除「文档在哪里」的问题",
      "记分卡：自动化评估，追踪每个服务对工程标准的合规性（有SLO、有运维手册、测试通过、文档已更新）并显示差距"
    ],
    "timeline": [
      [
        "2016",
        "Spotify internally develops an internal developer portal to manage their 2,000+ microservices"
      ],
      [
        "2020",
        "Spotify open-sources Backstage and donates it to the CNCF as a sandbox project"
      ],
      [
        "2022",
        "Backstage reaches CNCF Incubating status with 100+ open-source plugins and 900+ adopting companies"
      ],
      [
        "2023",
        "Internal Developer Platform (IDP) and platform engineering become mainstream, with Backstage as the de facto frontend layer"
      ],
      [
        "2025",
        "Backstage ecosystem matures with AI-assisted service creation, automated scorecards, and deep observability integrations"
      ]
    ],
    "timeline_zh": [
      [
        "2016",
        "Spotify在内部开发内部开发者门户以管理其2000+微服务"
      ],
      [
        "2020",
        "Spotify将Backstage开源并捐赠给CNCF作为沙箱项目"
      ],
      [
        "2022",
        "Backstage达到CNCF孵化状态，拥有100+开源插件和900+采用公司"
      ],
      [
        "2023",
        "内部开发者平台（IDP）和平台工程成为主流，Backstage作为事实标准前端层"
      ],
      [
        "2025",
        "Backstage生态成熟，具备AI辅助服务创建、自动化记分卡和深度可观测性集成"
      ]
    ],
    "dos": [
      "Do start with the software catalog as the foundation, because knowing what exists and who owns it is the prerequisite for all other developer experience improvements",
      "Do invest in golden-path templates that embed your organization's standards, because templates scale best practices without requiring enforcement through review",
      "Do measure developer experience quantitatively (time-to-first-deploy, onboarding time, search success rate) because feelings-based DX assessment doesn't drive improvement",
      "Do treat the portal as a product with a dedicated team, because an unmaintained developer portal becomes another abandoned internal tool within 6 months"
    ],
    "dos_zh": [
      "以软件目录为基础开始，因为了解存在什么以及谁拥有它是所有其他开发者体验改进的前提",
      "投入黄金路径模板以嵌入组织标准，因为模板通过模板而非审查来扩展最佳实践",
      "定量度量开发者体验（首次部署时间、入职时间、搜索成功率），因为基于感觉的DX评估无法驱动改进",
      "将门户视为一个有专门团队的产品，因为无人维护的开发者门户在6个月内就会成为另一个被放弃的内部工具"
    ],
    "donts": [
      "Don't build a portal without solving the service catalog problem first, because a pretty frontend over scattered, inaccurate data is worse than no portal",
      "Don't mandate portal adoption without providing compelling value first, because forced migration without clear benefits breeds resentment and workarounds",
      "Don't try to build everything custom -- start with Backstage's plugin ecosystem, because the community has already solved most common integration challenges",
      "Don't let the software catalog become stale -- automate catalog registration from CI/CD and enforce ownership metadata as a deployment gate"
    ],
    "donts_zh": [
      "不要在解决服务目录问题之前构建门户，因为在分散且不准确的数据上建造漂亮的前端比没有门户更糟",
      "不要在提供令人信服的价值之前就强制门户采用，因为没有明确收益的强制迁移会滋生怨恨和变通方案",
      "不要尝试全部自建——从Backstage的插件生态开始，因为社区已经解决了大多数常见的集成挑战",
      "不要让软件目录过时——从CI/CD自动化目录注册，并将所有权元数据作为部署门控强制执行"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify built Backstage to solve their developer experience crisis: with 2,000+ microservices owned by 200+ teams, developers were spending up to 60% of their cognitive load just finding the right information, dashboards, and documentation. After deploying Backstage with a comprehensive software catalog and golden-path templates, they reduced new service setup time from 3 days to 15 minutes and cut onboarding time for new engineers by 55%. The internal TechDocs integration ensured documentation lived alongside code and stayed current. Spotify's decision to open-source Backstage in 2020 made it the de facto standard for internal developer portals across the industry.",
    "case_study_zh": "Spotify构建Backstage以解决其开发者体验危机：拥有200+团队的2000+微服务，开发者将高达60%的认知负荷用于查找正确的信息、仪表盘和文档。部署包含完整软件目录和黄金路径模板的Backstage后，他们将新服务搭建时间从3天缩短到15分钟，新工程师入职时间缩短了55%。内部TechDocs集成确保文档与代码共存并保持最新。Spotify在2020年决定开源Backstage，使其成为行业内部开发者门户的事实标准。",
    "when_not_to_use": [
      "Small teams (fewer than 20 developers) where everyone already knows all the services and a portal adds unnecessary infrastructure",
      "Organizations with fewer than 10 services where a simple wiki or README provides sufficient service documentation",
      "Teams without dedicated platform engineering capacity to maintain and evolve the portal as a product",
      "Companies where all development happens in a single monorepo with excellent code search (the repo itself serves as the catalog)"
    ],
    "when_not_to_use_zh": [
      "每个人都已了解所有服务的小型团队（少于20名开发者）——门户增加不必要的基础设施",
      "服务少于10个的组织——简单的Wiki或README已能提供足够的服务文档",
      "没有专门的平台工程能力来将门户作为产品维护和演进的团队",
      "所有开发在单一单体仓库中进行且具有优秀代码搜索的公司——仓库本身即是目录"
    ],
    "adopters": [
      "Spotify",
      "Netflix",
      "Expedia",
      "HP",
      "IKEA"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "usability",
      "observability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Spotify (2020). \"Backstage: An Open Platform for Building Developer Portals\". backstage.io.",
    "secondary_sources": [
      "Winters, T., Manshreck, T. & Wright, H. (2020). \"Software Engineering at Google\". O'Reilly Media.",
      "CNCF (2022). \"Backstage CNCF Incubation Proposal\". cncf.io."
    ],
    "typed_relations": [
      {
        "slug": "documentation-as-code",
        "type": "complement"
      },
      {
        "slug": "dora-metrics",
        "type": "complement"
      },
      {
        "slug": "opentelemetry",
        "type": "complement"
      },
      {
        "slug": "slo-as-practice",
        "type": "complement"
      }
    ]
  },
  {
    "id": 160,
    "name": "Documentation as Code",
    "name_zh": "文档即代码",
    "slug": "documentation-as-code",
    "category": "observability",
    "desc": "Treat documentation like software: version-controlled, tested, reviewed, and continuously deployed",
    "desc_zh": "将文档视为软件对待：版本控制、测试、审查和持续部署",
    "steps": [
      "Store documentation alongside source code: put docs in the same repository as the code they describe, using lightweight markup (Markdown, AsciiDoc, reStructuredText)",
      "Apply software engineering workflows to docs: require pull request reviews for doc changes, run CI checks (linting, link validation, spell check), and version docs with releases",
      "Automate documentation generation: extract API docs from code annotations (OpenAPI, JSDoc, docstrings), generate architecture diagrams from code, and build reference docs from types",
      "Deploy docs through CI/CD pipelines: publish documentation automatically on every merge to main, just like deploying application code, ensuring docs are always current",
      "Test documentation accuracy: write automated tests that verify code examples compile, API endpoints return expected responses, and configuration samples are valid"
    ],
    "steps_zh": [
      "将文档与源代码一起存储：将文档放在其描述代码的同一仓库中，使用轻量级标记语言（Markdown、AsciiDoc、reStructuredText）",
      "将软件工程工作流应用于文档：要求文档变更经过Pull Request审查，运行CI检查（代码检查、链接验证、拼写检查），并随发布进行版本管理",
      "自动化文档生成：从代码注解（OpenAPI、JSDoc、docstring）提取API文档，从代码生成架构图，从类型生成参考文档",
      "通过CI/CD管道部署文档：每次合并到主分支时自动发布文档，就像部署应用代码一样，确保文档始终最新",
      "测试文档准确性：编写自动化测试验证代码示例可编译、API端点返回预期响应、配置示例有效"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Docs in Repo",
      "Engineering Workflow",
      "Auto-Generate",
      "CI/CD Deploy",
      "Test Accuracy"
    ],
    "viz_labels_zh": [
      "文档入仓",
      "工程化流程",
      "自动生成",
      "CI/CD发布",
      "准确性测试"
    ],
    "related": [
      "developer-portal-backstage",
      "gitops",
      "twelve-factor-app"
    ],
    "tags": [
      "observability",
      "documentation",
      "docs-as-code",
      "developer-experience",
      "technical-writing",
      "ci-cd"
    ],
    "origin_author": "Anne Gentle (2012); popularized by Write the Docs community and Google technical writing team",
    "origin_source": "Software Engineering at Google (Winters, Manshreck, Wright, 2020), Ch. 10 'Documentation'; Docs Like Code (Anne Gentle, 2017)",
    "origin_source_zh": "《Google软件工程》（Winters、Manshreck、Wright，2020）第10章「文档」；《Docs Like Code》（Anne Gentle，2017）",
    "complexity": "beginner",
    "when_to_use": [
      "Engineering organizations where documentation chronically falls out of date because it lives separately from the code",
      "API-driven platforms where accurate, auto-generated API documentation is critical for internal and external consumers",
      "Teams adopting DevOps or GitOps practices who want documentation to follow the same review, test, and deploy workflow as code",
      "Open-source projects where documentation quality directly impacts contributor onboarding and community growth"
    ],
    "when_to_use_zh": [
      "文档因与代码分离存储而长期过时的工程组织",
      "准确的自动生成API文档对内外部消费者至关重要的API驱动平台",
      "采用DevOps或GitOps实践、希望文档遵循与代码相同的审查、测试和部署工作流的团队",
      "文档质量直接影响贡献者入职和社区增长的开源项目"
    ],
    "core_concepts": [
      "Docs-in-Repo: Documentation lives in the same repository as the code it describes, ensuring it is versioned, branched, and reviewed alongside code changes",
      "Doc Tests: Automated tests that verify documentation accuracy: code samples compile, API examples return expected responses, and configuration snippets are valid",
      "Generated Documentation: API references, type documentation, and architecture diagrams automatically extracted from source code, eliminating manual duplication",
      "Review Workflow: Documentation changes go through the same pull request process as code -- with designated reviewers, CI checks, and approval requirements",
      "Continuous Deployment: Docs are built and published automatically on every merge, ensuring the published documentation always matches the latest code"
    ],
    "core_concepts_zh": [
      "仓库内文档：文档与其描述的代码存放在同一仓库中，确保与代码变更一起进行版本控制、分支和审查",
      "文档测试：验证文档准确性的自动化测试：代码示例可编译、API示例返回预期响应、配置片段有效",
      "生成式文档：从源代码自动提取API参考、类型文档和架构图，消除手动重复",
      "审查工作流：文档变更经过与代码相同的Pull Request流程——有指定审查者、CI检查和审批要求",
      "持续部署：每次合并时自动构建和发布文档，确保已发布文档始终匹配最新代码"
    ],
    "timeline": [
      [
        "2012",
        "Anne Gentle coins 'docs like code' and begins advocating for applying software workflows to technical writing"
      ],
      [
        "2015",
        "Write the Docs community grows rapidly, establishing docs-as-code as a movement with annual conferences"
      ],
      [
        "2017",
        "Anne Gentle publishes Docs Like Code, the first book dedicated to the practice"
      ],
      [
        "2020",
        "Software Engineering at Google dedicates Chapter 10 to documentation, emphasizing code-adjacent docs and automated freshness"
      ],
      [
        "2023",
        "AI-assisted documentation (Copilot for docs, auto-generated changelogs) accelerates the docs-as-code workflow"
      ]
    ],
    "timeline_zh": [
      [
        "2012",
        "Anne Gentle提出「docs like code」概念，开始倡导将软件工作流应用于技术写作"
      ],
      [
        "2015",
        "Write the Docs社区快速壮大，通过年度会议将文档即代码确立为一场运动"
      ],
      [
        "2017",
        "Anne Gentle出版《Docs Like Code》，首本专注于该实践的书籍"
      ],
      [
        "2020",
        "《Google软件工程》第10章专门讨论文档，强调代码邻近文档和自动化新鲜度"
      ],
      [
        "2023",
        "AI辅助文档（Copilot for docs、自动生成变更日志）加速文档即代码工作流"
      ]
    ],
    "dos": [
      "Do store docs in the same repo as the code they describe, because co-located docs are far more likely to be updated when the code changes",
      "Do add documentation CI checks (broken link detection, markdown linting, code sample compilation) to catch doc rot early",
      "Do generate API documentation from code annotations rather than writing it manually, because generated docs cannot drift from the implementation",
      "Do include documentation updates as a required part of the code review checklist, because docs-as-code only works when the team treats docs as first-class"
    ],
    "dos_zh": [
      "将文档存储在其描述代码的同一仓库中，因为同位文档在代码变更时被更新的可能性远高于外部文档",
      "添加文档CI检查（断链检测、Markdown检查、代码示例编译），以尽早捕获文档腐烂",
      "从代码注解生成API文档而非手动编写，因为生成的文档不会与实现产生偏差",
      "将文档更新作为代码审查清单的必需部分，因为文档即代码只有在团队将文档视为一等公民时才能奏效"
    ],
    "donts": [
      "Don't maintain documentation in a separate wiki disconnected from the code repository, because disconnected docs become stale within weeks",
      "Don't write documentation only at the end of a project, because retroactive documentation is always incomplete and often inaccurate",
      "Don't generate docs without human-written conceptual guides, because auto-generated API references without context are useless to newcomers",
      "Don't skip doc reviews in pull requests, because unreviewed documentation accumulates errors just as unreviewed code accumulates bugs"
    ],
    "donts_zh": [
      "不要在与代码仓库断开的独立Wiki中维护文档，因为断开的文档几周内就会过时",
      "不要仅在项目结束时编写文档，因为回顾性文档总是不完整且常常不准确",
      "不要在没有人类编写的概念指南的情况下生成文档，因为没有上下文的自动生成API参考对新手毫无用处",
      "不要在Pull Request中跳过文档审查，因为未审查的文档会积累错误，就像未审查的代码积累缺陷"
    ],
    "case_study_company": "Google",
    "case_study": "Google's engineering culture treats documentation as a first-class engineering artifact. As described in Software Engineering at Google, every code change that affects behavior is expected to include documentation updates in the same changelist. Google's internal documentation platform (g3doc) renders Markdown docs stored alongside code in their monorepo, automatically checking for freshness and flagging stale pages. They found that documentation written by the same engineer who wrote the code, reviewed in the same code review, was 3x more likely to be accurate than documentation written after the fact by a separate technical writer. This practice scaled to 30,000+ engineers across the company.",
    "case_study_zh": "Google的工程文化将文档视为一等工程制品。如《Google软件工程》所述，每个影响行为的代码变更都应在同一变更列表中包含文档更新。Google的内部文档平台（g3doc）渲染存储在单体仓库中的Markdown文档，自动检查新鲜度并标记过时页面。他们发现由编写代码的同一工程师撰写、在同一代码审查中评审的文档，其准确性比事后由独立技术作者编写的文档高3倍。这一实践扩展到公司的30000+工程师。",
    "when_not_to_use": [
      "Non-technical audience documentation (marketing content, user guides for non-developers) where a CMS provides a better authoring experience",
      "Highly visual documentation (design specs, wireframes) where WYSIWYG tools are essential and Markdown is too limiting",
      "Regulated industries requiring formal document management with approval workflows that exceed what Git provides",
      "Small personal projects where the overhead of CI pipelines and review workflows for docs is disproportionate"
    ],
    "when_not_to_use_zh": [
      "面向非技术受众的文档（营销内容、非开发者用户指南）——CMS提供更好的创作体验",
      "高度可视化的文档（设计规范、线框图）——WYSIWYG工具不可或缺且Markdown过于受限",
      "要求正式文档管理和超出Git能力的审批工作流的受监管行业",
      "文档的CI管道和审查工作流开销不成比例的小型个人项目"
    ],
    "adopters": [
      "Google",
      "Stripe",
      "GitLab",
      "Microsoft",
      "Cloudflare"
    ],
    "abstraction_level": "organization",
    "quality_concerns": [
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Gentle, A. (2017). \"Docs Like Code\". justwriteclick.com.",
    "secondary_sources": [
      "Winters, T., Manshreck, T. & Wright, H. (2020). \"Software Engineering at Google\", Ch. 10. O'Reilly Media.",
      "Eby, T. & Bhatti, Z. (2023). \"Docs-as-Code Tools and Workflows\". Write the Docs Community."
    ],
    "typed_relations": [
      {
        "slug": "developer-portal-backstage",
        "type": "complement"
      },
      {
        "slug": "gitops",
        "type": "complement"
      },
      {
        "slug": "twelve-factor-app",
        "type": "related"
      }
    ]
  },
  {
    "id": 220,
    "name": "Anomaly Detection Patterns",
    "name_zh": "异常检测模式",
    "slug": "anomaly-detection-patterns",
    "category": "observability",
    "desc": "ML-based and statistical anomaly detection for metrics, logs, and traces in production systems",
    "desc_zh": "生产系统中针对指标、日志和追踪的基于ML和统计的异常检测",
    "steps": [
      "Define what constitutes an anomaly for each signal type: point anomalies (single outlier values), contextual anomalies (normal value at abnormal time), and collective anomalies (pattern of individually normal values that together are abnormal)",
      "Select the appropriate detection algorithm for each metric's characteristics: Z-score or moving average for stable metrics, seasonal decomposition (STL) for time-series with known periodicity, and Isolation Forest or LSTM for multivariate or complex patterns",
      "Train baselines during representative traffic periods and tune sensitivity (false positive vs. false negative trade-off) per signal, erring toward fewer high-confidence alerts over many low-confidence ones",
      "Integrate anomaly scores into the alerting pipeline as a supplementary signal alongside threshold-based alerts, not as a replacement, to catch regressions that don't cross fixed thresholds",
      "Continuously validate the detector by correlating anomaly detections with confirmed incidents and tracking false positive rate; retrain models when traffic patterns change (new features, seasonality shifts)"
    ],
    "steps_zh": [
      "为每种信号类型定义什么构成异常：点异常（单个离群值）、上下文异常（在异常时间的正常值）和集体异常（单独正常值组合在一起的异常模式）",
      "为每个指标的特征选择适当的检测算法：对于稳定指标使用Z分数或移动平均，对于已知周期性的时间序列使用季节分解（STL），对于多变量或复杂模式使用Isolation Forest或LSTM",
      "在代表性流量期间训练基线，并为每个信号调整敏感度（误报与漏报权衡），倾向于较少的高置信度告警而不是许多低置信度告警",
      "将异常分数作为补充信号集成到告警流水线中，与基于阈值的告警并行，而不是替代，以捕获不超过固定阈值的回归",
      "通过将异常检测与已确认的事故关联并跟踪误报率来持续验证检测器；当流量模式变化时重新训练模型（新功能、季节性变化）"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Point Anomaly",
      "Detection Algorithm",
      "Baseline Training",
      "Alert Pipeline",
      "Validate Retrain"
    ],
    "viz_labels_zh": [
      "点异常",
      "检测算法",
      "基线训练",
      "告警管道",
      "持续验证"
    ],
    "related": [
      "structured-logging",
      "slo-as-practice",
      "feature-flag-observability"
    ],
    "tags": [
      "anomaly-detection",
      "ml-ops",
      "statistical",
      "alerting",
      "time-series"
    ],
    "origin_author": "Chandola, Banerjee & Kumar (2009, ACM Computing Surveys); productized by Datadog Watchdog (2018) and Netflix Argos",
    "origin_source": "Chandola, V., Banerjee, A. & Kumar, V. (2009). \"Anomaly Detection: A Survey\". ACM Computing Surveys, 41(3).",
    "origin_source_zh": "Chandola, V., Banerjee, A. & Kumar, V.（2009）「异常检测：综述」，《ACM Computing Surveys》，41(3)",
    "complexity": "advanced",
    "when_to_use": [
      "When threshold-based alerting is producing excessive false positives because traffic patterns vary by time of day, day of week, or seasonal events",
      "When incidents are being missed because the failure mode produces gradual degradation that never crosses a fixed threshold until it becomes catastrophic",
      "When the system has enough historical data (6+ months) to establish stable seasonal baselines for detection",
      "When the observability platform has metrics from hundreds of services and manual threshold management is no longer feasible"
    ],
    "when_to_use_zh": [
      "当基于阈值的告警产生过多误报，因为流量模式随时间、星期几或季节性事件变化时",
      "当由于故障模式产生逐渐降级而错过事故，这种降级从不超过固定阈值直到变成灾难性时",
      "当系统有足够的历史数据（6个月以上）来为检测建立稳定的季节性基线时",
      "当可观测性平台有来自数百个服务的指标，手动阈值管理不再可行时"
    ],
    "core_concepts": [
      "Point Anomaly: A single data point that deviates significantly from expected values — the simplest case, detectable with Z-score or IQR methods on stable metrics",
      "Contextual Anomaly: A data point that is anomalous only given its temporal context — e.g., 50 RPS is normal at 3am but anomalous at 2pm for the same service",
      "Seasonal Decomposition (STL): Separates a time series into trend, seasonal, and residual components; anomalies are detected in the residual component after removing expected patterns",
      "Isolation Forest: An ensemble tree-based method that isolates anomalies by randomly partitioning feature space; anomalous points require fewer partitions to isolate",
      "Sensitivity Tuning: The threshold on anomaly score above which an alert is fired; lower thresholds increase recall (catch more anomalies) but decrease precision (more false positives)"
    ],
    "core_concepts_zh": [
      "点异常：显著偏离预期值的单个数据点——最简单的情况，可以用稳定指标上的Z分数或IQR方法检测",
      "上下文异常：仅在其时间背景下才异常的数据点——例如，对同一服务而言，50 RPS在凌晨3点是正常的，但在下午2点则是异常的",
      "季节分解（STL）：将时间序列分解为趋势、季节和残差分量；在去除预期模式后在残差分量中检测异常",
      "Isolation Forest：一种基于集成树的方法，通过随机划分特征空间来隔离异常；异常点需要更少的划分来隔离",
      "敏感度调整：触发告警的异常分数阈值；较低的阈值提高召回率（捕获更多异常）但降低精确率（更多误报）"
    ],
    "timeline": [
      [
        "2009",
        "Chandola, Banerjee & Kumar publish the seminal anomaly detection survey in ACM Computing Surveys, classifying detection methods for the first time"
      ],
      [
        "2015",
        "Netflix open-sources Argos and Robust PCA-based anomaly detection for microservice metrics, influencing the industry"
      ],
      [
        "2018",
        "Datadog launches Watchdog, an always-on ML-based anomaly detection layer that surfaces anomalies without requiring manual threshold configuration"
      ],
      [
        "2020",
        "Amazon DevOps Guru and Azure Anomaly Detector launch, making cloud-native ML-based anomaly detection a managed service"
      ],
      [
        "2023",
        "LLM-based log anomaly detection (LogGPT, AIOps platforms) emerges, enabling natural language explanation of detected anomalies alongside scores"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "Chandola、Banerjee & Kumar在《ACM Computing Surveys》上发表开创性异常检测综述，首次对检测方法进行分类"
      ],
      [
        "2015",
        "Netflix开源Argos和基于鲁棒PCA的微服务指标异常检测，影响了整个行业"
      ],
      [
        "2018",
        "Datadog推出Watchdog，一个始终在线的基于ML的异常检测层，无需手动阈值配置即可发现异常"
      ],
      [
        "2020",
        "Amazon DevOps Guru和Azure Anomaly Detector推出，使云原生基于ML的异常检测成为托管服务"
      ],
      [
        "2023",
        "基于LLM的日志异常检测（LogGPT、AIOps平台）出现，支持在分数旁边用自然语言解释检测到的异常"
      ]
    ],
    "dos": [
      "Do combine anomaly detection with threshold-based alerts because anomaly detection catches gradual regressions while thresholds catch sudden catastrophic failures — they complement each other",
      "Do validate anomaly detectors on historical incident data before enabling them in production to measure precision and recall against known events",
      "Do separate anomaly detection models per service or metric category because a single global model will be dominated by high-traffic services and miss anomalies in low-traffic ones",
      "Do set a minimum data requirement (e.g., 4+ weeks of stable history) before enabling anomaly detection to prevent models from training on insufficient or misleading baselines"
    ],
    "dos_zh": [
      "将异常检测与基于阈值的告警结合，因为异常检测捕获渐进回归，而阈值捕获突发灾难性失败——它们相互补充",
      "在生产中启用异常检测器之前，在历史事故数据上进行验证，以衡量对已知事件的精确率和召回率",
      "按服务或指标类别分离异常检测模型，因为单个全局模型将被高流量服务主导，并错过低流量服务中的异常",
      "在启用异常检测之前设置最低数据要求（如至少4周的稳定历史），以防止模型在不足或误导性基线上训练"
    ],
    "donts": [
      "Don't treat anomaly detection as a replacement for SLO-based alerting because SLOs directly measure user impact while anomaly detection finds statistical deviations that may not matter to users",
      "Don't deploy anomaly detection without human review of initial alerts because newly trained models always have a burn-in period of high false positive rates",
      "Don't apply anomaly detection to metrics with fewer than a few hundred data points because statistical models require sufficient data to establish meaningful baselines",
      "Don't ignore seasonality when building detection models because a model unaware of weekly traffic patterns will fire alerts every Monday morning when traffic ramps up"
    ],
    "donts_zh": [
      "不要将异常检测视为基于SLO告警的替代品，因为SLO直接衡量用户影响，而异常检测发现可能对用户不重要的统计偏差",
      "不要在没有人工审查初始告警的情况下部署异常检测，因为新训练的模型总是有一个高误报率的磨合期",
      "不要对少于几百个数据点的指标应用异常检测，因为统计模型需要足够的数据来建立有意义的基线",
      "不要在构建检测模型时忽略季节性，因为不了解每周流量模式的模型会在每个周一早上流量增加时触发告警"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix developed its Argos anomaly detection system to monitor thousands of microservice metrics simultaneously. Before Argos, SREs maintained thousands of manual thresholds that became obsolete with every traffic pattern change. Argos applies a robust PCA decomposition to separate expected variance from anomalous variance, reducing alert noise by 72% while catching 15% more incidents than threshold-based alerting alone. The system processes over 2 million metric data points per minute across Netflix's microservice fleet and automatically surfaces the top anomalies ranked by impact score, allowing on-call engineers to investigate the most important signals first.",
    "case_study_zh": "Netflix开发了其Argos异常检测系统，同时监控数千个微服务指标。在Argos之前，SRE维护数千个手动阈值，这些阈值随每次流量模式变化而过时。Argos应用鲁棒PCA分解将预期方差与异常方差分离，将告警噪声减少了72%，同时比单独的基于阈值的告警多捕获15%的事故。该系统每分钟在Netflix微服务群中处理超过200万个指标数据点，并自动按影响分数排序呈现最重要的异常，允许值班工程师首先调查最重要的信号。",
    "when_not_to_use": [
      "New services with fewer than 4-6 weeks of production data where baselines cannot be reliably established",
      "Services with highly irregular traffic patterns that have no repeating seasonality, where seasonal models will produce constant false positives",
      "Simple services with a single critical metric (e.g., queue depth) where a threshold alert is more interpretable and equally effective",
      "Teams that are not yet ready to investigate and tune false positives because unmaintained anomaly detectors quickly become noise generators that are silently ignored"
    ],
    "when_not_to_use_zh": [
      "生产数据少于4-6周的新服务，基线无法可靠建立",
      "具有高度不规则流量模式且没有重复季节性的服务，季节性模型会产生持续误报",
      "具有单一关键指标（如队列深度）的简单服务，阈值告警更易解释且同样有效",
      "尚未准备好调查和调整误报的团队，因为未维护的异常检测器很快成为被静默忽视的噪声生成器"
    ],
    "adopters": [
      "Netflix",
      "Datadog",
      "LinkedIn",
      "Uber",
      "Amazon"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "reliability",
      "performance",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Chandola, V., Banerjee, A. & Kumar, V. (2009). \"Anomaly Detection: A Survey\". ACM Computing Surveys, 41(3), Article 15.",
    "secondary_sources": [
      "Laptev, N. et al. (2015). \"Generic and Scalable Framework for Automated Time-series Anomaly Detection\". KDD 2015.",
      "Datadog (2018). \"Introducing Watchdog: Automated Anomaly Detection for Metrics\". datadoghq.com.",
      "Chen, M. et al. (2019). \"AIOps: Real-World Challenges and Research Innovations\". IEEE Software."
    ],
    "typed_relations": [
      {
        "slug": "structured-logging",
        "type": "complement"
      },
      {
        "slug": "slo-as-practice",
        "type": "complement"
      },
      {
        "slug": "feature-flag-observability",
        "type": "complement"
      }
    ]
  },
  {
    "id": 221,
    "name": "Continuous Profiling",
    "name_zh": "持续性能分析",
    "slug": "continuous-profiling",
    "category": "observability",
    "desc": "Always-on production profiling using pprof, Pyroscope, and Datadog Continuous Profiler",
    "desc_zh": "使用pprof、Pyroscope和Datadog持续性能分析器进行始终开启的生产性能分析",
    "steps": [
      "Select a continuous profiling platform compatible with your language runtime: pprof for Go/C++, async-profiler for JVM, py-spy for Python, perf for native code, or a managed service (Pyroscope, Datadog Continuous Profiler, Grafana Phlare)",
      "Instrument the application with a low-overhead sampling profiler (typically 1-10% CPU overhead) using sampling rates of 100Hz or lower so that profiling data can be collected continuously in production without significant performance impact",
      "Attach profile metadata (service name, version, environment, region, pod ID) to each profile so that flamegraphs can be filtered and compared across deployments, versions, and infrastructure dimensions",
      "Integrate profile data with your existing observability stack: correlate high-latency traces with CPU profiles collected at the same time window, and link profiling dashboards from your APM tool",
      "Use differential flamegraph comparison between two time windows or versions to identify performance regressions introduced by recent deployments"
    ],
    "steps_zh": [
      "选择与您的语言运行时兼容的持续性能分析平台：Go/C++使用pprof，JVM使用async-profiler，Python使用py-spy，原生代码使用perf，或使用托管服务（Pyroscope、Datadog持续性能分析器、Grafana Phlare）",
      "为应用程序接入低开销采样性能分析器（通常1-10%的CPU开销），使用100Hz或更低的采样率，以便可以在生产中持续收集性能分析数据而不产生显著性能影响",
      "将性能分析元数据（服务名称、版本、环境、区域、Pod ID）附加到每份性能分析数据，以便可以跨部署、版本和基础设施维度过滤和比较火焰图",
      "将性能分析数据与现有可观测性栈集成：将高延迟追踪与同一时间窗口收集的CPU性能分析数据关联，并从APM工具链接到性能分析仪表板",
      "使用两个时间窗口或版本之间的差分火焰图比较，以识别最近部署引入的性能回归"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Profiler Select",
      "Low-Overhead Sampling",
      "Profile Metadata",
      "Trace Correlate",
      "Diff Flamegraph"
    ],
    "viz_labels_zh": [
      "分析器选择",
      "低开销采样",
      "性能元数据",
      "与追踪关联",
      "差量火焰图"
    ],
    "related": [
      "distributed-tracing",
      "opentelemetry",
      "slo-as-practice"
    ],
    "tags": [
      "profiling",
      "flamegraph",
      "pprof",
      "pyroscope",
      "performance"
    ],
    "origin_author": "Brendan Gregg (flamegraph inventor, 2011); Go pprof (Google, 2012); Pyroscope (2020)",
    "origin_source": "Gregg, B. (2016). \"Systems Performance\", Ch. 2. Prentice Hall; Gregg, B. (2011). \"Flame Graphs\". brendangregg.com.",
    "origin_source_zh": "Gregg, B.（2016）《系统性能》，第2章，Prentice Hall；Gregg, B.（2011）「火焰图」，brendangregg.com",
    "complexity": "advanced",
    "when_to_use": [
      "When distributed traces show high latency but don't point to a specific external call — the bottleneck is CPU-bound computation within a service rather than downstream I/O",
      "When infrastructure costs are rising unexpectedly and the team needs to identify which functions are consuming the most CPU or memory across the fleet",
      "When a performance regression is suspected after a deployment but APM metrics and logs don't provide sufficient resolution to identify the changed code path",
      "When optimizing ML inference or data processing pipelines where function-level CPU profiling can identify the top-5 hot paths that dominate runtime"
    ],
    "when_to_use_zh": [
      "当分布式追踪显示高延迟但不指向特定的外部调用时——瓶颈是服务内的CPU密集型计算而不是下游I/O",
      "当基础设施成本意外上升，团队需要识别哪些函数在整个集群中消耗最多CPU或内存时",
      "当部署后怀疑存在性能回归，但APM指标和日志没有提供足够的分辨率来识别更改的代码路径时",
      "当优化ML推理或数据处理流水线时，函数级CPU性能分析可以识别主导运行时间的前5个热路径"
    ],
    "core_concepts": [
      "Flamegraph: A visualization of stack traces where the x-axis represents the total time spent in a function (width) and the y-axis represents the call stack depth; the widest frames are the biggest performance contributors",
      "Sampling Profiler: Interrupts the process at a fixed frequency (e.g., 100Hz), captures the current call stack, and aggregates samples over time; produces statistically accurate profiles with minimal overhead (~1-5%)",
      "Instrumentation Profiler: Injects timing code into every function call; produces exact measurements but incurs significant overhead (10-100x slowdown) that makes it unsuitable for production use",
      "Differential Flamegraph: A comparison of two flamegraphs (before vs. after a deployment) where blue frames represent performance improvements and red frames represent regressions",
      "Always-On Profiling: Unlike traditional profiling triggered during incidents, continuous profiling collects profiles at all times so that data is available retrospectively when investigating an issue"
    ],
    "core_concepts_zh": [
      "火焰图：堆栈跟踪的可视化，x轴表示函数花费的总时间（宽度），y轴表示调用堆栈深度；最宽的帧是最大的性能贡献者",
      "采样性能分析器：以固定频率（如100Hz）中断进程，捕获当前调用堆栈，并随时间聚合样本；以最小开销（约1-5%）产生统计上准确的性能分析结果",
      "插桩性能分析器：将计时代码注入每个函数调用；产生精确测量，但会产生显著开销（10-100倍减速），使其不适合生产使用",
      "差分火焰图：两个火焰图的比较（部署前后），蓝色帧表示性能改进，红色帧表示回归",
      "始终开启的性能分析：与传统的在事故期间触发的性能分析不同，持续性能分析始终收集性能分析数据，以便在调查问题时可以回溯获取数据"
    ],
    "timeline": [
      [
        "2011",
        "Brendan Gregg invents the flamegraph visualization at Sun Microsystems, revolutionizing how engineers interpret profiling data"
      ],
      [
        "2012",
        "Go 1.0 ships with pprof integration, making heap and CPU profiling a first-class feature of the Go standard library"
      ],
      [
        "2018",
        "Datadog Continuous Profiler launches in beta, becoming one of the first commercial always-on production profiling services"
      ],
      [
        "2020",
        "Pyroscope open-sourced by Grafana Labs predecessor; Grafana acquires and integrates it as Grafana Phlare in 2022"
      ],
      [
        "2024",
        "OpenTelemetry Profiling Signal reaches beta, standardizing profile data collection alongside traces, metrics, and logs in a unified observability pipeline"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Brendan Gregg在Sun Microsystems发明火焰图可视化，彻底改变了工程师解读性能分析数据的方式"
      ],
      [
        "2012",
        "Go 1.0附带pprof集成，使堆和CPU性能分析成为Go标准库的一级特性"
      ],
      [
        "2018",
        "Datadog持续性能分析器以beta版推出，成为最早的商业始终开启生产性能分析服务之一"
      ],
      [
        "2020",
        "Pyroscope由Grafana Labs前身开源；Grafana于2022年收购并将其集成为Grafana Phlare"
      ],
      [
        "2024",
        "OpenTelemetry性能分析信号达到beta，在统一的可观测性流水线中标准化性能分析数据收集，与追踪、指标和日志并列"
      ]
    ],
    "dos": [
      "Do attach deployment metadata (version, commit SHA, environment) to profiles so that differential flamegraphs can immediately isolate the code change responsible for a regression",
      "Do correlate profiles with traces by linking high-latency trace spans to CPU profiles collected in the same time window using trace ID as the correlation key",
      "Do store profiles with sufficient retention (30+ days) to enable retrospective investigation of production incidents without requiring a live reproduction",
      "Do review profiling data during capacity planning cycles to identify optimization opportunities that can defer infrastructure scaling"
    ],
    "dos_zh": [
      "将部署元数据（版本、提交SHA、环境）附加到性能分析数据，以便差分火焰图可以立即隔离导致回归的代码变更",
      "通过使用追踪ID作为关联键将高延迟追踪span与同一时间窗口收集的CPU性能分析数据链接，将性能分析数据与追踪关联",
      "以足够的保留期（30天以上）存储性能分析数据，以便在不需要实时重现的情况下对生产事故进行回溯调查",
      "在容量规划周期中审查性能分析数据，以识别可以推迟基础设施扩展的优化机会"
    ],
    "donts": [
      "Don't use instrumentation profilers (full function tracing) in production because the overhead will cause the profiler to become the primary performance bottleneck",
      "Don't profile only during incidents because you need historical baselines to understand what 'normal' looks like for meaningful comparison",
      "Don't discard profiling data after 7 days because performance regressions are often not noticed until weeks after a deployment when the cumulative cost becomes visible",
      "Don't treat profiling as a reactive tool used only by performance engineers — instrument all production services by default and democratize access to flamegraphs for all engineers"
    ],
    "donts_zh": [
      "不要在生产中使用插桩性能分析器（完整函数追踪），因为开销会导致性能分析器本身成为主要性能瓶颈",
      "不要只在事故期间进行性能分析，因为您需要历史基线来理解「正常」是什么样的以进行有意义的比较",
      "不要在7天后丢弃性能分析数据，因为性能回归通常在部署后数周才被注意到，当累积成本变得可见时",
      "不要将性能分析视为仅由性能工程师使用的反应性工具——默认为所有生产服务接入性能分析，并为所有工程师开放火焰图访问"
    ],
    "case_study_company": "Shopify",
    "case_study": "Shopify deployed continuous profiling across its Ruby on Rails monolith using Datadog Continuous Profiler, attaching profile data to every deployment event. After a Black Friday traffic spike caused unexpected CPU saturation, engineers used differential flamegraphs to compare profiles from the week before and after the previous major release. The analysis revealed that a seemingly innocuous change to the cart serialization logic had increased CPU usage by 18% due to a missing memoization. The fix was deployed within 4 hours of identifying the flamegraph regression. Without always-on profiling, the investigation would have required reproducing the issue in a load test environment, which Shopify estimated would take 2-3 days.",
    "case_study_zh": "Shopify使用Datadog持续性能分析器在其Ruby on Rails单体中部署了持续性能分析，将性能分析数据附加到每个部署事件。在黑色星期五流量峰值导致意外的CPU饱和后，工程师使用差分火焰图比较上一个主要版本前后一周的性能分析数据。分析揭示了一个看似无害的购物车序列化逻辑变更，由于缺少记忆化导致CPU使用率增加了18%。在识别火焰图回归后4小时内部署了修复。如果没有始终开启的性能分析，调查将需要在负载测试环境中重现问题，Shopify估计这将需要2-3天。",
    "when_not_to_use": [
      "Serverless functions with execution durations under 100ms where profiling overhead per invocation exceeds the function's runtime",
      "Services with fewer than 5 RPS where sampling profilers produce too few samples to generate statistically meaningful flamegraphs",
      "Organizations that have not yet established basic metrics and tracing observability, where profiling would address symptoms before identifying the root cause signals",
      "Early-stage prototypes where the code is rewritten frequently and profile data becomes stale before engineers can act on it"
    ],
    "when_not_to_use_zh": [
      "执行时长低于100ms的无服务器函数，每次调用的性能分析开销超过函数运行时",
      "RPS低于5的服务，采样性能分析器产生的样本太少，无法生成统计上有意义的火焰图",
      "尚未建立基本指标和追踪可观测性的组织，在识别根本原因信号之前，性能分析会解决症状",
      "早期原型，代码被频繁重写，工程师采取行动之前配置文件数据已过时"
    ],
    "adopters": [
      "Shopify",
      "Google",
      "Datadog",
      "Grafana Labs",
      "Cloudflare"
    ],
    "abstraction_level": "component",
    "quality_concerns": [
      "performance",
      "reliability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Gregg, B. (2011). \"Flame Graphs\". brendangregg.com; Gregg, B. (2016). \"Systems Performance\", Ch. 2. Prentice Hall.",
    "secondary_sources": [
      "Kalman, G. et al. (2020). \"Pyroscope: Open Source Continuous Profiling\". pyroscope.io.",
      "Datadog (2018). \"Continuous Profiler Documentation\". docs.datadoghq.com.",
      "OpenTelemetry (2024). \"Profiling Signal Specification\". opentelemetry.io."
    ],
    "typed_relations": [
      {
        "slug": "distributed-tracing",
        "type": "complement"
      },
      {
        "slug": "opentelemetry",
        "type": "complement"
      },
      {
        "slug": "slo-as-practice",
        "type": "complement"
      }
    ]
  },
  {
    "id": 222,
    "name": "Incident Management Framework",
    "name_zh": "事故管理框架",
    "slug": "incident-management-framework",
    "category": "observability",
    "desc": "Structured incident response process covering detection, triage, resolution, and blameless retrospective",
    "desc_zh": "涵盖检测、分类、解决和无指责复盘的结构化事故响应流程",
    "steps": [
      "Establish incident severity levels (SEV1-SEV4) with explicit criteria (user impact, revenue risk, data loss) and associated response time SLAs so that all engineers classify incidents consistently",
      "Define the Incident Commander role: one person owns the incident, coordinates responders, manages external communication, and prevents parallel investigation threads from fragmenting focus",
      "Follow the five-phase response cycle: Detect (alert fires or user report), Triage (confirm severity and assemble responders), Investigate (identify scope and root cause), Resolve (apply fix and verify recovery), and Retrospect (blameless post-incident review)",
      "Maintain a live incident timeline in a shared document (PagerDuty, Slack incident channel, Confluence) throughout the incident, recording every action with a timestamp so that the timeline is accurate for the post-mortem",
      "Conduct a blameless post-incident review within 5 business days using the five-whys method, identifying systemic contributing factors, and committing to a specific set of action items with owners and due dates"
    ],
    "steps_zh": [
      "建立具有明确标准（用户影响、收入风险、数据丢失）和相关响应时间SLA的事故严重性级别（SEV1-SEV4），使所有工程师一致地分类事故",
      "定义事故指挥官角色：一个人拥有事故、协调响应者、管理外部沟通，并防止并行调查线程分散注意力",
      "遵循五阶段响应周期：检测（告警触发或用户报告）、分类（确认严重性并集合响应者）、调查（识别范围和根本原因）、解决（应用修复并验证恢复），以及复盘（无指责事后审查）",
      "在整个事故期间在共享文档（PagerDuty、Slack事故频道、Confluence）中维护实时事故时间线，记录每个带时间戳的行动，以便时间线对事后分析准确",
      "在5个工作日内使用五个为什么方法进行无指责事后审查，识别系统性促成因素，并承诺具有所有者和截止日期的特定行动项目集"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Severity Levels",
      "Incident Commander",
      "Response Cycle",
      "Live Timeline",
      "Post-Incident Review"
    ],
    "viz_labels_zh": [
      "严重等级",
      "事件指挥官",
      "响应周期",
      "实时时间线",
      "事后复盘"
    ],
    "related": [
      "blameless-postmortems",
      "runbook-automation",
      "on-call-rotation-design"
    ],
    "tags": [
      "incident-management",
      "sre",
      "on-call",
      "postmortem",
      "response"
    ],
    "origin_author": "Google SRE framework (2003); PagerDuty Incident Response Guide (2016); Atlassian Incident Management handbook (2018)",
    "origin_source": "Beyer, B. et al. (2016). \"Site Reliability Engineering\", Ch. 14. Google / O'Reilly; PagerDuty (2016). \"Incident Response Operational Guide\". response.pagerduty.com.",
    "origin_source_zh": "Beyer, B.等（2016）「网站可靠性工程」，第14章，Google/O'Reilly；PagerDuty（2016）「事故响应操作指南」，response.pagerduty.com",
    "complexity": "intermediate",
    "when_to_use": [
      "When the team has grown past 5-6 engineers and ad-hoc 'all hands on deck' incident response results in duplicated effort, confused ownership, and unclear resolution",
      "When post-incident reviews reveal that incidents were resolved slowly because no single person was responsible for coordinating the response",
      "When customer-facing SLOs are being violated and the organization needs consistent, measurable incident response to reduce MTTR and improve transparency",
      "When regulatory compliance (SOC 2, ISO 27001, HIPAA) requires documented incident response procedures and evidence of their consistent application"
    ],
    "when_to_use_zh": [
      "当团队超过5-6名工程师，临时的「全员上阵」事故响应导致重复工作、所有权混乱和不明确的解决时",
      "当事后审查揭示事故解决缓慢，因为没有单一负责人协调响应时",
      "当面向客户的SLO被违反，组织需要一致、可衡量的事故响应以减少MTTR和提高透明度时",
      "当法规合规（SOC 2、ISO 27001、HIPAA）要求有记录的事故响应程序及其一致应用的证据时"
    ],
    "core_concepts": [
      "Incident Commander (IC): The single decision-making authority during an incident who coordinates responders, manages communication, and ensures the response doesn't fragment across multiple competing threads",
      "Severity Levels (SEV1-SEV4): Standardized impact classification — SEV1 (all users affected, revenue impacted), SEV2 (significant users affected), SEV3 (partial degradation, workaround available), SEV4 (minor impact, monitoring)",
      "Incident Timeline: A chronological log of every detection, investigation step, action taken, and recovery verification, maintained in real time during the incident to support accurate post-mortem analysis",
      "Blameless Postmortem: A structured retrospective that identifies systemic factors contributing to an incident without attributing blame to individuals, producing specific, actionable improvements",
      "On-Call Handoff: A standardized briefing given at shift change during long-running incidents to transfer context, current status, and next actions from the outgoing to the incoming responder"
    ],
    "core_concepts_zh": [
      "事故指挥官（IC）：事故期间的单一决策权威，协调响应者、管理沟通，并确保响应不会分散到多个竞争线程中",
      "严重性级别（SEV1-SEV4）：标准化影响分类——SEV1（所有用户受影响，收入受损）、SEV2（大量用户受影响）、SEV3（部分降级，有变通方法）、SEV4（微小影响，监控中）",
      "事故时间线：在事故期间实时维护的每次检测、调查步骤、采取的行动和恢复验证的时间顺序日志，以支持准确的事后分析",
      "无指责事后审查：一种结构化回顾，在不将责任归咎于个人的情况下识别导致事故的系统性因素，产生具体、可操作的改进",
      "值班交接：在长时间运行的事故期间班次变更时给出的标准化简报，将背景、当前状态和下一步行动从离任响应者转移给接任响应者"
    ],
    "timeline": [
      [
        "2003",
        "Google SRE team develops the Incident Command System adapted from emergency services, introducing the Incident Commander role to software operations"
      ],
      [
        "2012",
        "Etsy publishes its blameless post-mortem culture blog post, normalizing psychological safety in incident response across the tech industry"
      ],
      [
        "2016",
        "PagerDuty open-sources its Incident Response Operational Guide, providing a detailed playbook adopted by thousands of organizations"
      ],
      [
        "2018",
        "Atlassian publishes its Incident Management Handbook and StatusPage integrations, making incident communication tooling widely accessible"
      ],
      [
        "2022",
        "Incident.io and Rootly launch AI-assisted incident management platforms that auto-generate timelines, suggest runbook steps, and draft post-mortems"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Google SRE团队开发了改编自紧急服务的事故指挥系统，将事故指挥官角色引入软件运营"
      ],
      [
        "2012",
        "Etsy发布其无指责事后分析文化博客文章，在科技行业中将心理安全正常化"
      ],
      [
        "2016",
        "PagerDuty开源其事故响应操作指南，提供被数千个组织采用的详细操作手册"
      ],
      [
        "2018",
        "Atlassian发布其事故管理手册和StatusPage集成，使事故沟通工具广泛可及"
      ],
      [
        "2022",
        "Incident.io和Rootly推出AI辅助事故管理平台，自动生成时间线、建议运行手册步骤并起草事后分析"
      ]
    ],
    "dos": [
      "Do designate a single Incident Commander who owns the incident end-to-end and has authority to make decisions, even if they are not the most technically expert person in the room",
      "Do open a dedicated incident Slack channel (or equivalent) immediately upon declaration so that all communication is centralized and the audit trail is preserved",
      "Do conduct post-incident reviews for every SEV1 and SEV2, and a representative sample of SEV3 incidents, because the patterns in lower-severity incidents predict future SEV1s",
      "Do track action items from post-mortems in the same project management system as product work and report on their completion rate to leadership quarterly"
    ],
    "dos_zh": [
      "指定单一事故指挥官从头到尾拥有事故并有权做出决定，即使他们不是房间里技术最熟练的人",
      "在声明事故后立即开启专用的事故Slack频道（或等效工具），以便所有通信集中且审计跟踪得到保存",
      "对每个SEV1和SEV2以及代表性SEV3事故样本进行事后审查，因为较低严重性事故中的模式预测未来的SEV1",
      "在与产品工作相同的项目管理系统中跟踪事后分析的行动项目，并每季度向领导层报告其完成率"
    ],
    "donts": [
      "Don't allow incidents to proceed without a declared Incident Commander because uncoordinated responses cause duplicate work, missed steps, and confused external communication",
      "Don't delay the post-mortem beyond 5 business days because memory of the incident details fades quickly and the urgency to fix contributing factors dissipates",
      "Don't use post-mortems to assign individual blame because blame suppresses honest reporting of mistakes, which is the primary source of learning in incident response",
      "Don't close action items from post-mortems without evidence of completion because unclosed action items are the primary driver of recurring incidents"
    ],
    "donts_zh": [
      "不要在没有声明事故指挥官的情况下进行事故响应，因为不协调的响应会导致重复工作、遗漏步骤和混乱的外部沟通",
      "不要将事后分析延迟超过5个工作日，因为事故细节的记忆很快消退，修复促成因素的紧迫感也会消散",
      "不要使用事后分析来追究个人责任，因为责任会压制对错误的诚实报告，而这是事故响应学习的主要来源",
      "不要在没有完成证据的情况下关闭事后分析的行动项目，因为未关闭的行动项目是重复事故的主要驱动因素"
    ],
    "case_study_company": "Atlassian",
    "case_study": "Atlassian experienced a major outage in April 2022 affecting over 400 enterprise customers for up to two weeks. In its public post-mortem, Atlassian documented how its incident management framework was stress-tested at an unprecedented scale. The company credited its Incident Commander model with preventing fragmented response across multiple time zones and teams. The post-mortem identified that an automated script executed a deletion command on a wider scope than intended, and that multiple safeguards failed simultaneously. Atlassian published 8 specific engineering improvements with owners and completion dates, setting a new standard for transparent enterprise incident communication.",
    "case_study_zh": "Atlassian在2022年4月经历了一次影响400多个企业客户长达两周的重大停机。在其公开事后分析中，Atlassian记录了其事故管理框架如何在前所未有的规模下经受考验。该公司将事故指挥官模型归功于防止跨多个时区和团队的分散响应。事后分析确定了自动化脚本在比预期更广泛的范围内执行了删除命令，以及多个保障措施同时失效。Atlassian发布了8项具有所有者和完成日期的具体工程改进，为透明的企业事故沟通树立了新标准。",
    "when_not_to_use": [
      "Small startups with fewer than 5 engineers where informal coordination is sufficient and a formal IC structure would add overhead without benefit",
      "Planned maintenance windows that are not true incidents but may use similar communication channels",
      "Non-production incidents (development environment failures, CI pipeline outages) that do not affect customers and do not require the full incident response ceremony",
      "Teams running on a pure infrastructure-as-a-service model with full managed service SLAs where the cloud provider owns incident response for the underlying platform"
    ],
    "when_not_to_use_zh": [
      "少于5名工程师的小型创业公司，非正式协调已经足够，正式IC结构会增加开销而没有收益",
      "计划的维护窗口，不是真正的事故，但可能使用类似的沟通渠道",
      "不影响客户且不需要完整事故响应流程的非生产事故（开发环境故障、CI流水线中断）",
      "在纯基础设施即服务模型上运行的团队，具有完整托管服务SLA，云提供商拥有底层平台的事故响应"
    ],
    "adopters": [
      "Google",
      "Atlassian",
      "PagerDuty",
      "Stripe",
      "Netflix"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "maintainability",
      "usability"
    ],
    "maturity_ring": "established",
    "primary_source": "Beyer, B., Jones, C., Petoff, J. & Murphy, N.R. (2016). \"Site Reliability Engineering\", Ch. 14. O'Reilly Media.",
    "secondary_sources": [
      "PagerDuty (2016). \"Incident Response Operational Guide\". response.pagerduty.com.",
      "Atlassian (2018). \"Incident Management Handbook\". atlassian.com/incident-management.",
      "Allspaw, J. & Hammond, P. (2009). \"10+ Deploys Per Day\". Velocity Conference, O'Reilly."
    ],
    "typed_relations": [
      {
        "slug": "blameless-postmortems",
        "type": "complement"
      },
      {
        "slug": "runbook-automation",
        "type": "complement"
      },
      {
        "slug": "on-call-rotation-design",
        "type": "complement"
      }
    ]
  },
  {
    "id": 223,
    "name": "Chaos Observability",
    "name_zh": "混沌可观测性",
    "slug": "chaos-observability",
    "category": "observability",
    "desc": "Observability practices specifically designed for chaos engineering experiments and resilience validation",
    "desc_zh": "专为混沌工程实验和弹性验证设计的可观测性实践",
    "steps": [
      "Establish a pre-experiment observability baseline: capture screenshots or snapshots of dashboards (latency percentiles, error rates, saturation metrics) 15 minutes before the chaos experiment starts",
      "Define the steady-state hypothesis in observable terms: 'p99 latency remains below 500ms', 'error rate stays under 0.1%', 'all health check endpoints return 200' — these are the abort conditions if violated",
      "Instrument the chaos experiment itself with a start event and end event emitted as structured log events or trace spans with experiment_id, target, and fault_type labels so the experiment appears in dashboards",
      "Monitor observability signals in real time during the experiment; configure an automatic abort trigger that halts the experiment if any steady-state condition is violated beyond the defined blast radius",
      "Conduct a post-experiment analysis comparing the pre-experiment baseline to in-experiment and post-experiment metrics to validate whether the system behaved as expected and whether recovery was complete"
    ],
    "steps_zh": [
      "建立实验前可观测性基线：在混沌实验开始前15分钟捕获仪表板的截图或快照（延迟百分位数、错误率、饱和度指标）",
      "以可观测的术语定义稳态假设：「p99延迟保持在500ms以下」、「错误率保持在0.1%以下」、「所有健康检查端点返回200」——如果违反，这些是中止条件",
      "对混沌实验本身进行仪器化，发出带有experiment_id、target和fault_type标签的结构化日志事件或追踪span作为开始事件和结束事件，使实验出现在仪表板中",
      "在实验期间实时监控可观测性信号；配置自动中止触发器，如果任何稳态条件在定义的爆炸半径之外被违反，则停止实验",
      "进行实验后分析，比较实验前基线与实验中和实验后指标，以验证系统是否按预期运行，以及恢复是否完整"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Baseline Snapshot",
      "Steady-State Hypothesis",
      "Instrument Experiment",
      "Abort Trigger",
      "Post-Analysis"
    ],
    "viz_labels_zh": [
      "基线快照",
      "稳态假设",
      "实验埋点",
      "中止触发",
      "事后分析"
    ],
    "related": [
      "slo-as-practice",
      "opentelemetry"
    ],
    "tags": [
      "chaos-engineering",
      "resilience",
      "fault-injection",
      "steady-state",
      "blast-radius"
    ],
    "origin_author": "Netflix Chaos Monkey (2011); chaos engineering principles formalized by Rosenthal, Casey et al. (2017, Chaos Engineering book)",
    "origin_source": "Rosenthal, C. et al. (2017). \"Chaos Engineering\". O'Reilly Media; Basiri, A. et al. (2016). \"Chaos Engineering\". IEEE Software, 33(3).",
    "origin_source_zh": "Rosenthal, C.等（2017）「混沌工程」，O'Reilly Media；Basiri, A.等（2016）「混沌工程」，IEEE Software，33(3)",
    "complexity": "advanced",
    "when_to_use": [
      "When the team has completed basic chaos experiments and wants to validate that observability signals faithfully reflect system behavior during failures",
      "When conducting chaos experiments in production or staging and need safety mechanisms to automatically abort experiments that cause unexpected customer impact",
      "When building the organizational confidence to run chaos experiments in production by demonstrating that monitoring coverage is sufficient to detect and respond to emergent failures",
      "When testing whether the on-call alerting stack (SLO alerts, anomaly detectors) fires appropriately during a specific failure mode before that failure mode occurs organically"
    ],
    "when_to_use_zh": [
      "当团队完成了基本混沌实验，并希望验证可观测性信号在故障期间是否忠实反映系统行为时",
      "当在生产或预发布环境中进行混沌实验，需要安全机制自动中止导致意外客户影响的实验时",
      "当通过证明监控覆盖足以检测和响应涌现故障，来建立在生产中运行混沌实验的组织信心时",
      "当在特定故障模式有机发生之前，测试值班告警栈（SLO告警、异常检测器）是否在该故障模式期间适当触发时"
    ],
    "core_concepts": [
      "Steady-State Hypothesis: A pre-defined set of observable system behaviors (SLO metrics, health checks, queue depths) that must remain within acceptable bounds throughout a chaos experiment",
      "Blast Radius: The intentional scope limitation of a chaos experiment — restricted to a percentage of traffic, a single region, or specific pod labels — to limit customer impact if the experiment reveals unexpected behavior",
      "Experiment Annotation: A marker injected into the metrics and logging pipeline at experiment start and end, enabling engineers to overlay experiment windows on dashboards and correlate observability signals with fault injection",
      "Automatic Abort: A safety mechanism that monitors steady-state conditions during the experiment and terminates the experiment if conditions are violated beyond the blast radius, preventing runaway failures",
      "Observability-First Chaos: The principle that chaos experiments should only be run in systems that already have sufficient observability coverage — you cannot validate resilience if you cannot observe what happens during the failure"
    ],
    "core_concepts_zh": [
      "稳态假设：一组预定义的可观测系统行为（SLO指标、健康检查、队列深度），在整个混沌实验过程中必须保持在可接受的范围内",
      "爆炸半径：混沌实验的故意范围限制——限制为流量的一个百分比、单个区域或特定Pod标签——以便在实验揭示意外行为时限制客户影响",
      "实验注释：在实验开始和结束时注入指标和日志流水线的标记，使工程师能够在仪表板上叠加实验窗口并将可观测性信号与故障注入关联",
      "自动中止：一种安全机制，在实验期间监控稳态条件，如果条件在爆炸半径之外被违反，则终止实验，防止失控故障",
      "可观测性优先的混沌：混沌实验只应在已经具有足够可观测性覆盖的系统中运行的原则——如果你不能观察到故障期间发生的事情，你就无法验证弹性"
    ],
    "timeline": [
      [
        "2011",
        "Netflix releases Chaos Monkey, injecting random instance terminations into production to validate auto-recovery mechanisms"
      ],
      [
        "2014",
        "Netflix expands to the Simian Army (Chaos Gorilla, Chaos Kong), testing availability zone and region-level failures with improved observability hooks"
      ],
      [
        "2016",
        "Rosenthal and Basiri publish principles of chaos engineering in IEEE Software, formalizing the steady-state hypothesis and blast radius concepts"
      ],
      [
        "2017",
        "Gremlin launches as a commercial chaos engineering platform with built-in safety halt mechanisms and observability integrations"
      ],
      [
        "2022",
        "OpenTelemetry Chaos extension proposal emerges, aiming to standardize experiment annotation as a first-class observability signal alongside traces"
      ]
    ],
    "timeline_zh": [
      [
        "2011",
        "Netflix发布Chaos Monkey，向生产中注入随机实例终止以验证自动恢复机制"
      ],
      [
        "2014",
        "Netflix扩展到Simian Army（Chaos Gorilla、Chaos Kong），测试可用区和区域级故障，并改进了可观测性钩子"
      ],
      [
        "2016",
        "Rosenthal和Basiri在IEEE Software上发表混沌工程原则，正式化了稳态假设和爆炸半径概念"
      ],
      [
        "2017",
        "Gremlin作为商业混沌工程平台推出，内置安全停止机制和可观测性集成"
      ],
      [
        "2022",
        "OpenTelemetry混沌扩展提案出现，旨在将实验注释作为与追踪并列的一级可观测性信号标准化"
      ]
    ],
    "dos": [
      "Do define and validate the steady-state hypothesis before running the experiment because without a clear definition of 'working correctly', you cannot determine whether the chaos experiment revealed a problem",
      "Do annotate experiment start and end events in your metrics system so that engineers can overlay the experiment window on any dashboard to correlate signals with the fault injection",
      "Do start with the smallest possible blast radius (1% of traffic, a single instance) and expand only when you have confidence in your observability coverage and abort mechanisms",
      "Do treat a chaos experiment that doesn't trigger an alert as valuable negative data — it either validates resilience or reveals a gap in observability coverage"
    ],
    "dos_zh": [
      "在运行实验之前定义并验证稳态假设，因为如果没有「正常运行」的清晰定义，您就无法确定混沌实验是否揭示了问题",
      "在指标系统中注释实验开始和结束事件，以便工程师可以在任何仪表板上叠加实验窗口，将信号与故障注入关联",
      "从尽可能小的爆炸半径开始（1%的流量、单个实例），只有在对可观测性覆盖和中止机制有信心时才扩展",
      "将不触发告警的混沌实验视为有价值的负面数据——它要么验证了弹性，要么揭示了可观测性覆盖中的差距"
    ],
    "donts": [
      "Don't run chaos experiments in systems that lack basic distributed tracing and SLO alerting because you will be unable to detect the impact of fault injection or determine when recovery is complete",
      "Don't skip the blast radius limitation in production because a full-production chaos experiment without a controlled scope is an uncontrolled incident, not an experiment",
      "Don't run chaos experiments during high-traffic business hours or before major product launches without executive approval because the blast radius risk is higher during peak load",
      "Don't treat chaos experiments as a one-time event because system complexity evolves continuously and resilience properties that held 6 months ago may not hold after significant architectural changes"
    ],
    "donts_zh": [
      "不要在缺乏基本分布式追踪和SLO告警的系统中运行混沌实验，因为您将无法检测故障注入的影响或确定恢复何时完成",
      "不要在生产中跳过爆炸半径限制，因为没有受控范围的完整生产混沌实验是不受控制的事故，而不是实验",
      "不要在高流量业务时间或重大产品发布前在没有高管批准的情况下运行混沌实验，因为峰值负载期间爆炸半径风险更高",
      "不要将混沌实验视为一次性事件，因为系统复杂性不断演进，6个月前成立的弹性属性可能在重大架构变更后不再成立"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's Chaos Engineering team, which invented the discipline, runs thousands of automated chaos experiments per week against its production streaming infrastructure. Their Fault Injection Testing (FIT) platform integrates directly with Netflix's observability stack (Atlas metrics, Edgar tracing) so that every experiment is annotated in dashboards. Engineers can view the exact impact of a fault injection on p99 streaming start time, buffering ratio, and playback errors in real time. The FIT platform enforces automatic experiment abort if error rates exceed 0.1% above baseline, ensuring that experiments never materially impact the 220 million subscribers watching at any given moment. Netflix publishes that 99.99% of FIT experiments are halted within 30 seconds if steady-state conditions are violated.",
    "case_study_zh": "Netflix的混沌工程团队（该学科的发明者）每周对其生产流媒体基础设施运行数千次自动化混沌实验。其故障注入测试（FIT）平台直接与Netflix的可观测性栈（Atlas指标、Edgar追踪）集成，以便每个实验都在仪表板中注释。工程师可以实时查看故障注入对p99流媒体启动时间、缓冲比率和播放错误的确切影响。如果错误率超过基线0.1%以上，FIT平台强制自动中止实验，确保实验永远不会实质性影响任何给定时刻正在观看的2.2亿订阅者。Netflix发布的数据显示，如果稳态条件被违反，99.99%的FIT实验在30秒内被停止。",
    "when_not_to_use": [
      "Systems without basic distributed tracing and SLO-based alerting, where chaos observability would surface a 'no signal' gap rather than meaningful resilience validation",
      "Production systems that have never run chaos experiments in staging, where the first chaos experiment should always be in a lower environment",
      "Regulated financial or healthcare systems where any intentional fault injection requires formal risk assessment and regulatory approval before execution",
      "Teams in early-stage development where the cost of building chaos observability infrastructure exceeds the maturity level of the system being tested"
    ],
    "when_not_to_use_zh": [
      "没有基本分布式追踪和基于SLO告警的系统，混沌可观测性会发现「无信号」差距而非有意义的弹性验证",
      "从未在预发布环境中运行过混沌实验的生产系统，第一次混沌实验应始终在较低环境中进行",
      "任何故意的故障注入在执行前都需要正式风险评估和监管批准的受监管金融或医疗系统",
      "早期开发阶段的团队，构建混沌可观测性基础设施的成本超过被测系统的成熟度水平"
    ],
    "adopters": [
      "Netflix",
      "Gremlin",
      "Amazon",
      "LinkedIn",
      "Microsoft Azure"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "testability",
      "maintainability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "Rosenthal, C. et al. (2017). \"Chaos Engineering: System Resiliency in Practice\". O'Reilly Media.",
    "secondary_sources": [
      "Basiri, A. et al. (2016). \"Chaos Engineering\". IEEE Software, 33(3), pp. 35-41.",
      "Netflix Technology Blog (2014). \"The Netflix Simian Army\". netflixtechblog.com.",
      "Gremlin (2020). \"Chaos Engineering: Breaking Things on Purpose\". gremlin.com."
    ],
    "typed_relations": [
      {
        "slug": "slo-as-practice",
        "type": "complement"
      },
      {
        "slug": "opentelemetry",
        "type": "complement"
      }
    ]
  },
  {
    "id": 224,
    "name": "Cost Observability (FinOps)",
    "name_zh": "成本可观测性（FinOps）",
    "slug": "cost-observability-finops",
    "category": "observability",
    "desc": "Cloud cost monitoring, allocation, and optimization frameworks aligned with the FinOps Foundation model",
    "desc_zh": "与FinOps基金会模型对齐的云成本监控、分配和优化框架",
    "steps": [
      "Instrument cloud cost data with service, team, environment, and feature tags at every cloud resource so that costs can be attributed to specific teams and products rather than appearing as a single undifferentiated bill",
      "Set up a cost observability dashboard aggregating spend by service, team, and environment (production vs. development) with day-over-day and week-over-week cost trends alongside standard infrastructure metrics",
      "Establish unit economics metrics (cost per API call, cost per active user, cost per ML inference) that connect infrastructure spend to business value and enable cost efficiency tracking over time",
      "Create cost anomaly alerts that trigger when any team's cloud spend increases more than 20% week-over-week without a corresponding increase in business metrics, enabling fast detection of runaway costs",
      "Hold monthly FinOps reviews where engineering, product, and finance teams collectively review cost trends, identify optimization opportunities, and prioritize cost reduction initiatives in the product backlog"
    ],
    "steps_zh": [
      "在每个云资源上使用服务、团队、环境和功能标签对云成本数据进行仪器化，以便将成本归因于特定团队和产品，而不是作为单一的未区分账单出现",
      "设置成本可观测性仪表板，按服务、团队和环境（生产与开发）聚合支出，并提供逐日和逐周成本趋势以及标准基础设施指标",
      "建立单位经济学指标（每次API调用成本、每个活跃用户成本、每次ML推断成本），将基础设施支出与业务价值连接起来，并随时间跟踪成本效率",
      "创建成本异常告警，当任何团队的云支出在没有相应业务指标增加的情况下环比增加超过20%时触发，以便快速检测失控成本",
      "举行月度FinOps评审，工程、产品和财务团队共同审查成本趋势，识别优化机会，并在产品待办列表中优先排列成本削减举措"
    ],
    "ai_relevant": true,
    "viz_type": "matrix",
    "viz_labels": [
      "Cost Tagging",
      "Cost Dashboard",
      "Unit Economics",
      "Cost Anomaly Alert",
      "FinOps Review"
    ],
    "viz_labels_zh": [
      "成本标签",
      "成本看板",
      "单位经济",
      "异常告警",
      "FinOps评审"
    ],
    "related": [
      "slo-as-practice",
      "platform-engineering"
    ],
    "tags": [
      "finops",
      "cloud-cost",
      "cost-optimization",
      "unit-economics",
      "tagging"
    ],
    "origin_author": "FinOps Foundation (J.R. Storment & Mike Fuller, 2019); Cloud Financial Management practices emerging from AWS Cost Explorer (2014)",
    "origin_source": "FinOps Foundation (2019). \"Cloud Financial Management\". finops.org; Storment, J.R. & Fuller, M. (2019). \"Cloud FinOps\". O'Reilly Media.",
    "origin_source_zh": "FinOps基金会（2019）「云财务管理」，finops.org；Storment, J.R. & Fuller, M.（2019）「云FinOps」，O'Reilly Media",
    "complexity": "intermediate",
    "when_to_use": [
      "When cloud costs are growing faster than revenue or user growth and the organization cannot attribute cost increases to specific teams, features, or products",
      "When the engineering team receives a monthly cloud bill with no cost attribution data and finance and product teams cannot hold engineering accountable for cost efficiency",
      "When cost optimization is a strategic priority but there is no mechanism to measure the impact of optimization efforts on actual spend",
      "When AI/ML workloads are being adopted at scale and GPU and inference costs are creating new, unpredictable cost categories that require dedicated observability"
    ],
    "when_to_use_zh": [
      "当云成本的增长速度快于收入或用户增长，且组织无法将成本增加归因于特定团队、功能或产品时",
      "当工程团队收到没有成本归因数据的每月云账单，财务和产品团队无法让工程团队对成本效率负责时",
      "当成本优化是战略优先事项，但没有机制来衡量优化工作对实际支出的影响时",
      "当AI/ML工作负载被大规模采用，GPU和推断成本正在创造需要专用可观测性的新的不可预测成本类别时"
    ],
    "core_concepts": [
      "Cost Attribution: The practice of tagging every cloud resource with metadata (team, service, environment, feature) so that costs can be aggregated and reported at the team and product level",
      "Unit Economics: Normalizing infrastructure cost against a business metric (users, requests, inferences) to produce a cost-per-unit metric that decouples absolute spend from business growth",
      "FinOps Lifecycle: The Inform-Optimize-Operate cycle from the FinOps Foundation model — first make costs visible (Inform), then identify and implement savings (Optimize), then embed cost governance in day-to-day engineering (Operate)",
      "Rightsizing: Matching cloud resource sizes to actual usage patterns by identifying over-provisioned instances, unused reserved capacity, and idle resources that can be downsized or terminated",
      "Commitment-Based Discounts: Reserved Instances, Savings Plans (AWS), and Committed Use Discounts (GCP) that trade flexibility for cost savings of 30-70% over on-demand pricing"
    ],
    "core_concepts_zh": [
      "成本归因：用元数据（团队、服务、环境、功能）标记每个云资源的实践，以便可以在团队和产品层面汇总和报告成本",
      "单位经济学：将基础设施成本归一化为业务指标（用户、请求、推断），产生每单位成本指标，将绝对支出与业务增长解耦",
      "FinOps生命周期：来自FinOps基金会模型的通知-优化-运营循环——首先使成本可见（通知），然后识别并实施节省（优化），然后将成本治理嵌入日常工程中（运营）",
      "合理调整规模：通过识别过度配置的实例、未使用的预留容量和可以缩减或终止的空闲资源，将云资源大小与实际使用模式匹配",
      "基于承诺的折扣：预留实例、节省计划（AWS）和承诺使用折扣（GCP），以灵活性换取比按需定价节省30-70%的成本"
    ],
    "timeline": [
      [
        "2014",
        "AWS releases Cost Explorer, providing the first cloud-native cost visualization tool that enables spend analysis by service and time period"
      ],
      [
        "2019",
        "FinOps Foundation founded by J.R. Storment and Mike Fuller; publishes the FinOps Framework and Cloud Financial Management best practices"
      ],
      [
        "2020",
        "Major cloud providers (AWS, Azure, GCP) launch cost anomaly detection services, making automated cost alerting accessible without custom tooling"
      ],
      [
        "2022",
        "AI/ML workload costs explode; FinOps Foundation releases GPU and inference cost management guidance as a specialized domain"
      ],
      [
        "2024",
        "Engineering platforms (Internal Developer Portals) begin embedding per-service cost dashboards directly in developer workflows, making cost visibility a first-class engineering concern"
      ]
    ],
    "timeline_zh": [
      [
        "2014",
        "AWS发布Cost Explorer，提供第一个云原生成本可视化工具，支持按服务和时间段进行支出分析"
      ],
      [
        "2019",
        "FinOps基金会由J.R. Storment和Mike Fuller创立；发布FinOps框架和云财务管理最佳实践"
      ],
      [
        "2020",
        "主要云提供商（AWS、Azure、GCP）推出成本异常检测服务，使自动化成本告警无需自定义工具即可访问"
      ],
      [
        "2022",
        "AI/ML工作负载成本爆炸；FinOps基金会发布GPU和推断成本管理指南作为专业领域"
      ],
      [
        "2024",
        "工程平台（内部开发者门户）开始将按服务成本仪表板直接嵌入开发者工作流，使成本可见性成为一级工程关注点"
      ]
    ],
    "dos": [
      "Do implement resource tagging standards before deploying any cost observability tooling because without consistent tags, cost attribution is impossible and the tooling will produce unactionable data",
      "Do set per-team cloud cost budgets with automated alerts at 80% and 100% of monthly budget so that teams have visibility into their own spending before the bill arrives",
      "Do establish unit cost metrics (cost per API request, cost per active user) and track them as part of each team's OKRs so that cost efficiency is considered alongside feature velocity",
      "Do involve finance and product stakeholders in monthly FinOps reviews because cost optimization decisions require business context that engineering teams alone do not have"
    ],
    "dos_zh": [
      "在部署任何成本可观测性工具之前实施资源标签标准，因为没有一致的标签，成本归因是不可能的，工具将产生无法操作的数据",
      "设置带有月度预算80%和100%自动告警的按团队云成本预算，使团队在账单到来之前对自己的支出有可见性",
      "建立单位成本指标（每次API请求成本、每个活跃用户成本），并将其作为每个团队OKR的一部分进行跟踪，使成本效率与功能速度一起被考虑",
      "在月度FinOps评审中让财务和产品利益相关方参与，因为成本优化决策需要工程团队单独不具备的业务背景"
    ],
    "donts": [
      "Don't treat cloud cost as solely a finance problem — engineering decisions (instance types, data transfer patterns, retention policies) are the primary levers for cost control and engineers must own them",
      "Don't optimize for cost at the expense of reliability without explicit business trade-off analysis because cutting costs that undermine availability SLOs will cost more in incident response and customer churn",
      "Don't enforce rigid cost budgets on teams without giving them the tooling and authority to make the changes needed to stay within budget",
      "Don't skip cost tagging for development and staging environments because non-production costs often represent 20-40% of total cloud spend and hiding them makes cost attribution inaccurate"
    ],
    "donts_zh": [
      "不要将云成本视为纯粹的财务问题——工程决策（实例类型、数据传输模式、保留策略）是成本控制的主要杠杆，工程师必须拥有它们",
      "不要在没有明确业务权衡分析的情况下以牺牲可靠性为代价优化成本，因为削减损害可用性SLO的成本将在事故响应和客户流失中花费更多",
      "不要在不给团队提供工具和权力来做出保持预算内所需变更的情况下对团队强制执行严格的成本预算",
      "不要跳过开发和预发布环境的成本标签，因为非生产成本通常占总云支出的20-40%，隐藏它们会使成本归因不准确"
    ],
    "case_study_company": "Spotify",
    "case_study": "Spotify moved from a single cloud bill to a fully attributed cost model over 18 months, tagging every GCP resource with squad, tribe, and product area labels. By building a FinOps dashboard embedded in their internal developer portal, squad leads could see their real-time cloud spend alongside their service SLOs and deployment frequency. Within 6 months of implementing cost visibility, engineering squads voluntarily right-sized 23% of their compute instances (reducing average instance cost by $340/month per service) and cleaned up $2.1M in abandoned development environments. Spotify credited the visibility-first approach — making engineers the primary owners of cost data, not finance — as the key factor in achieving these savings without top-down mandates.",
    "case_study_zh": "Spotify在18个月内从单一云账单过渡到完全归因的成本模型，用小组、部落和产品区域标签标记每个GCP资源。通过构建嵌入其内部开发者门户的FinOps仪表板，小组负责人可以看到其实时云支出以及服务SLO和部署频率。在实施成本可见性的6个月内，工程小组自愿将23%的计算实例合理调整大小（每个服务每月平均实例成本降低340美元），并清理了210万美元的废弃开发环境。Spotify将可见性优先的方法——让工程师成为成本数据的主要所有者，而不是财务部门——归功于在没有自上而下授权的情况下实现这些节省的关键因素。",
    "when_not_to_use": [
      "Early-stage startups with cloud bills under $10,000/month where the overhead of implementing cost attribution infrastructure exceeds the potential savings",
      "Organizations on fixed-price cloud contracts where the marginal cost of individual resource decisions is already absorbed in the contract price",
      "Teams in the initial months of a major migration project where cost optimization would slow down the migration and optimization should follow after the migration stabilizes",
      "On-premise or private cloud deployments where the cost model is dominated by CapEx hardware depreciation rather than OpEx per-resource consumption"
    ],
    "when_not_to_use_zh": [
      "月云账单低于10000美元的早期创业公司，实施成本归因基础设施的开销超过潜在节省",
      "固定价格云合同的组织，单个资源决策的边际成本已经被合同价格吸收",
      "重大迁移项目初期的团队，成本优化会减慢迁移速度，优化应在迁移稳定后进行",
      "本地或私有云部署，成本模型主要由资本支出硬件折旧主导，而非按资源消耗的运营支出"
    ],
    "adopters": [
      "Spotify",
      "Airbnb",
      "Pinterest",
      "Lyft",
      "Snap"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "scalability",
      "reliability"
    ],
    "maturity_ring": "established",
    "primary_source": "FinOps Foundation (2019). \"Cloud Financial Management Framework\". finops.org.",
    "secondary_sources": [
      "Storment, J.R. & Fuller, M. (2019). \"Cloud FinOps: Collaborative, Real-Time Cloud Financial Management\". O'Reilly Media.",
      "AWS (2022). \"AWS Well-Architected Framework: Cost Optimization Pillar\". docs.aws.amazon.com.",
      "Google Cloud (2021). \"Google Cloud Cost Management Best Practices\". cloud.google.com."
    ],
    "typed_relations": [
      {
        "slug": "slo-as-practice",
        "type": "complement"
      },
      {
        "slug": "platform-engineering",
        "type": "complement"
      }
    ]
  },
  {
    "id": 272,
    "name": "Observability-as-Code",
    "name_zh": "可观测性即代码",
    "slug": "observability-as-code",
    "category": "observability",
    "desc": "Defining monitoring, alerts, and dashboards as version-controlled code to ensure reproducible, auditable observability infrastructure",
    "desc_zh": "将监控、告警和仪表盘定义为版本控制代码，确保可观测性基础设施的可复现性和可审计性",
    "steps": [
      "Codify dashboards and alert rules using tools like Terraform, Pulumi, or vendor-specific providers (Grafana-as-code, Datadog Terraform provider) so every observability artifact lives in source control",
      "Establish a GitOps workflow: all changes to monitors, dashboards, and SLO definitions go through pull requests with peer review and CI validation before being applied to production",
      "Use templating and modules to DRY up common observability patterns — a single Terraform module can generate standard latency/error/saturation dashboards for every microservice from a variables map",
      "Integrate observability provisioning into service scaffolding so new services automatically get baseline dashboards, alert rules, and log queries without manual setup by platform teams",
      "Apply environment promotion: observability configs progress from dev → staging → prod in lockstep with service deployments, preventing dashboards from drifting out of sync with the services they monitor"
    ],
    "steps_zh": [
      "使用Terraform、Pulumi或厂商特定提供商（Grafana-as-code、Datadog Terraform提供商）将仪表盘和告警规则代码化，使所有可观测性制品都纳入源码管理",
      "建立GitOps工作流：对监控器、仪表盘和SLO定义的所有变更都通过拉取请求经同行评审和CI验证后才应用到生产环境",
      "使用模板化和模块化来消除常见可观测性模式的重复——单个Terraform模块可以从变量映射为每个微服务生成标准的延迟/错误/饱和度仪表盘",
      "将可观测性配置集成到服务脚手架中，使新服务无需平台团队手动配置即可自动获得基线仪表盘、告警规则和日志查询",
      "应用环境晋级：可观测性配置与服务部署同步从开发→预发布→生产逐级推进，防止仪表盘与其监控的服务脱节"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Codify Dashboards",
      "GitOps Workflow",
      "DRY Templates",
      "Auto-Provision",
      "Env Promotion"
    ],
    "viz_labels_zh": [
      "代码化看板",
      "GitOps流程",
      "复用模板",
      "自动供给",
      "环境晋级"
    ],
    "related": [
      "opentelemetry",
      "sli-slo-sla",
      "four-golden-signals",
      "infrastructure-as-code",
      "gitops"
    ],
    "tags": [
      "observability",
      "infrastructure-as-code",
      "gitops",
      "monitoring",
      "dashboards",
      "alerts",
      "terraform"
    ],
    "origin_author": "Hashicorp",
    "origin_source": "HashiCorp Terraform Grafana Provider documentation and community practice, 2019",
    "origin_source_zh": "HashiCorp Terraform Grafana提供商文档及社区实践，2019年",
    "complexity": "intermediate",
    "when_to_use": [
      "Teams managing many microservices where manual dashboard and alert creation creates inconsistency and configuration drift across services",
      "Organizations practicing GitOps where infrastructure changes require peer review and audit trails — extending this discipline to observability configs",
      "Platform engineering teams who need to provision standard observability for dozens or hundreds of services with minimal per-service manual effort",
      "Post-incident reviews reveal that missing or stale alerts failed to catch issues, motivating a code-review process for alert rule changes"
    ],
    "when_to_use_zh": [
      "管理众多微服务的团队，手动创建仪表盘和告警导致服务间配置不一致和漂移",
      "实践GitOps的组织，基础设施变更需要同行评审和审计追踪——将这一规范延伸到可观测性配置",
      "需要以最少手动工作量为数十或数百个服务配置标准可观测性的平台工程团队",
      "事后复盘揭示缺失或过时的告警未能捕获问题，促使对告警规则变更实施代码评审流程"
    ],
    "core_concepts": [
      "Infrastructure-as-Code for Observability: treating dashboards, alert rules, SLO configs, and log queries as first-class infrastructure artifacts stored in Git and applied via automated pipelines",
      "GitOps Workflow: all observability changes go through version-controlled pull requests, enabling peer review, change history, rollback, and audit trails for every monitoring configuration change",
      "Templated Observability: using Terraform modules, Jsonnet, or Grafonnet to generate consistent dashboards from service metadata, eliminating per-service manual dashboard creation",
      "Environment Promotion: observability configurations follow the same promotion lifecycle as application code, ensuring dashboards and alerts are in sync with deployed service versions",
      "Self-Service Provisioning: service teams define their observability requirements in code and the platform automatically applies them, removing bottlenecks from central platform teams"
    ],
    "core_concepts_zh": [
      "可观测性基础设施即代码：将仪表盘、告警规则、SLO配置和日志查询视为一等基础设施制品，存储在Git中并通过自动化管道应用",
      "GitOps工作流：所有可观测性变更通过版本控制的拉取请求进行，为每次监控配置变更提供同行评审、变更历史、回滚和审计追踪能力",
      "模板化可观测性：使用Terraform模块、Jsonnet或Grafonnet从服务元数据生成一致的仪表盘，消除按服务手动创建仪表盘的工作",
      "环境晋级：可观测性配置遵循与应用代码相同的晋级生命周期，确保仪表盘和告警与已部署的服务版本同步",
      "自助式配置：服务团队以代码形式定义其可观测性需求，平台自动应用，消除中央平台团队的瓶颈"
    ],
    "timeline": [
      [
        "2014",
        "Terraform released by HashiCorp, establishing infrastructure-as-code as a mainstream practice applicable to all cloud resources"
      ],
      [
        "2019",
        "Grafana and Datadog release Terraform providers, enabling dashboards and alerts to be managed as code in the same workflow as infrastructure"
      ],
      [
        "2021",
        "Grafana Labs publishes Grafonnet (Jsonnet library) and Grafana-as-code tooling, making dashboard templating first-class"
      ],
      [
        "2023",
        "Platform engineering teams adopt observability-as-code as a core Internal Developer Portal (IDP) capability alongside service scaffolding"
      ]
    ],
    "timeline_zh": [
      [
        "2014",
        "HashiCorp发布Terraform，将基础设施即代码确立为适用于所有云资源的主流实践"
      ],
      [
        "2019",
        "Grafana和Datadog发布Terraform提供商，使仪表盘和告警能够在与基础设施相同的工作流中以代码形式管理"
      ],
      [
        "2021",
        "Grafana Labs发布Grafonnet（Jsonnet库）和Grafana-as-code工具，使仪表盘模板化成为一等公民"
      ],
      [
        "2023",
        "平台工程团队将可观测性即代码采纳为内部开发者门户（IDP）的核心能力，与服务脚手架并列"
      ]
    ],
    "dos": [
      "Do store observability configs in the same repository as the service they monitor so that dashboard changes are reviewed alongside application code changes in the same pull request",
      "Do use modules and templates aggressively to ensure all services get the same baseline dashboards and reduce the cost of maintaining hundreds of individual dashboard files",
      "Do validate observability configs in CI — run terraform plan or equivalent before merging to catch syntax errors and unintended resource deletions before they reach production",
      "Do treat alert rule changes with the same rigor as production code changes, requiring at least one reviewer who understands the alert's purpose and threshold rationale"
    ],
    "dos_zh": [
      "将可观测性配置与其监控的服务存储在同一仓库中，以便仪表盘变更与应用代码变更在同一拉取请求中一起评审",
      "积极使用模块和模板确保所有服务获得相同的基线仪表盘，降低维护数百个独立仪表盘文件的成本",
      "在CI中验证可观测性配置——在合并前运行terraform plan或等效命令，在生产前捕获语法错误和意外资源删除",
      "以与生产代码变更相同的严格程度对待告警规则变更，要求至少有一名了解告警目的和阈值依据的评审者"
    ],
    "donts": [
      "Don't allow engineers to create dashboards or alerts directly in the observability UI without committing them to code — UI-only configs are invisible to code review and will be lost in a disaster recovery scenario",
      "Don't create a single monolithic dashboard file per service — break observability code into focused, reusable modules (latency module, error module, saturation module) that can be combined",
      "Don't hardcode environment-specific values (thresholds, URLs, team names) in templates — parameterize them so the same module works across dev, staging, and production",
      "Don't skip testing alert rules — write unit tests for threshold logic and integration tests that fire synthetic metrics to verify alerts trigger correctly before deploying to production"
    ],
    "donts_zh": [
      "不要允许工程师直接在可观测性UI中创建仪表盘或告警而不将其提交到代码——仅限UI的配置对代码评审不可见，且在灾难恢复场景中会丢失",
      "不要为每个服务创建单一的庞大仪表盘文件——将可观测性代码分解为专注、可复用的模块（延迟模块、错误模块、饱和度模块），可组合使用",
      "不要在模板中硬编码特定环境的值（阈值、URL、团队名称）——将其参数化，使同一模块可在开发、预发布和生产环境中工作",
      "不要跳过测试告警规则——为阈值逻辑编写单元测试，并编写发送合成指标的集成测试，以在部署到生产环境前验证告警正确触发"
    ],
    "case_study_company": "Monzo",
    "case_study": "Monzo, the UK challenger bank, manages over 1,500 microservices and codified their entire Grafana dashboard estate using Jsonnet templates stored in a central observability repository. Every service registers its metadata (name, team, SLOs) in a service catalog, and a CI pipeline automatically generates and deploys standard RED-method dashboards for each service. When Monzo's SRE team wants to add a new panel to all service dashboards, they make a single template change that propagates to all 1,500+ dashboards in one deployment. This approach eliminated 'ghost dashboards' (UI-only configs that existed nowhere in code), reduced dashboard creation time from hours to under 5 minutes per service, and gave every on-call engineer confidence that the dashboards they see in production are current and authoritative.",
    "case_study_zh": "英国挑战者银行Monzo管理超过1500个微服务，使用存储在中央可观测性仓库中的Jsonnet模板将其整个Grafana仪表盘资产代码化。每个服务在服务目录中注册其元数据（名称、团队、SLO），CI管道自动为每个服务生成和部署标准RED方法仪表盘。当Monzo的SRE团队想为所有服务仪表盘添加新面板时，只需进行一次模板变更，即可在一次部署中传播到所有1500多个仪表盘。这种方法消除了「幽灵仪表盘」（仅存在于UI中而在代码中不存在的配置），将每个服务的仪表盘创建时间从数小时缩短到5分钟以内，并让每位待命工程师相信他们在生产环境中看到的仪表盘是最新且权威的。",
    "when_not_to_use": [
      "Small teams with fewer than 5 services where the overhead of setting up Terraform providers and GitOps pipelines exceeds the benefit of version-controlled observability",
      "Organizations in the early stages of observability maturity where establishing basic instrumentation and alerting is more urgent than codifying existing gaps",
      "Exploratory debugging phases where engineers need to rapidly iterate on dashboard design in the UI before the visualization requirements are stable enough to codify"
    ],
    "when_not_to_use_zh": [
      "拥有少于5个服务的小团队，设置Terraform提供商和GitOps管道的开销超过版本控制可观测性的收益",
      "可观测性成熟度处于早期阶段的组织，建立基本的埋点和告警比将现有差距代码化更紧迫",
      "探索性调试阶段，工程师需要在UI中快速迭代仪表盘设计，在可视化需求稳定到足以代码化之前"
    ],
    "adopters": [
      "Monzo",
      "Grafana Labs",
      "Datadog",
      "HashiCorp",
      "Shopify",
      "Atlassian"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "maintainability",
      "reliability"
    ],
    "maturity_ring": "emerging",
    "primary_source": "HashiCorp (2019). \"Terraform Grafana Provider\". registry.terraform.io/providers/grafana/grafana.",
    "secondary_sources": [
      "Grafana Labs (2021). \"Grafonnet: Jsonnet library for Grafana dashboards\". github.com/grafana/grafonnet.",
      "Majors, C., Fong-Jones, L., Miranda, G. (2022). \"Observability Engineering\". O'Reilly Media.",
      "Skelton, M. & Pais, M. (2019). \"Team Topologies\". IT Revolution Press."
    ],
    "typed_relations": [
      {
        "slug": "opentelemetry",
        "type": "complement"
      },
      {
        "slug": "sli-slo-sla",
        "type": "complement"
      },
      {
        "slug": "four-golden-signals",
        "type": "related"
      }
    ]
  },
  {
    "id": 273,
    "name": "Service Level Indicators (SLI)",
    "name_zh": "服务级别指标",
    "slug": "service-level-indicators",
    "category": "observability",
    "desc": "Quantitative measures of service behavior that define the precise metrics used to assess whether a service is meeting its reliability commitments",
    "desc_zh": "服务行为的量化度量，定义用于评估服务是否满足可靠性承诺的精确指标",
    "steps": [
      "Identify the user-facing behaviors that matter most: for each critical user journey (checkout, login, search), define what 'working correctly' means in measurable terms — latency, availability, correctness, or throughput",
      "Choose SLI metrics that are close to the user experience: prefer request success rates and P99 latency measured at the load balancer or client over internal system metrics like CPU utilization",
      "Define SLI formulas precisely: an availability SLI might be 'the proportion of HTTP requests that return a 2xx or 4xx status code within 2 seconds over a rolling 28-day window'",
      "Instrument and validate: ensure your monitoring system can actually measure the defined SLI with sufficient granularity and that the metric correlates with real user experience degradation",
      "Attach SLIs to SLOs: every SLI must have a corresponding SLO target (e.g., SLI = request success rate, SLO = success rate ≥ 99.9%) and feed into an error budget calculation"
    ],
    "steps_zh": [
      "识别最重要的面向用户的行为：对于每个关键用户旅程（结账、登录、搜索），以可衡量的术语定义「正常工作」的含义——延迟、可用性、正确性或吞吐量",
      "选择贴近用户体验的SLI指标：优先选择在负载均衡器或客户端测量的请求成功率和P99延迟，而非CPU利用率等内部系统指标",
      "精确定义SLI公式：可用性SLI可能是「在28天滚动窗口内，2秒内返回2xx或4xx状态码的HTTP请求比例」",
      "埋点并验证：确保监控系统能够以足够的粒度实际测量定义的SLI，并且该指标与真实用户体验的降级相关联",
      "将SLI与SLO关联：每个SLI必须有对应的SLO目标（例如SLI=请求成功率，SLO=成功率≥99.9%），并纳入错误预算计算"
    ],
    "ai_relevant": true,
    "viz_type": "pyramid",
    "viz_labels": [
      "User Journey",
      "SLI Metrics",
      "SLO Target",
      "Error Budget",
      "Review"
    ],
    "viz_labels_zh": [
      "用户旅程",
      "SLI指标",
      "SLO目标",
      "错误预算",
      "定期评审"
    ],
    "related": [
      "sli-slo-sla",
      "four-golden-signals",
      "red-method",
      "use-method",
      "opentelemetry"
    ],
    "tags": [
      "sli",
      "slo",
      "reliability",
      "sre",
      "google",
      "metrics",
      "observability"
    ],
    "origin_author": "Google SRE",
    "origin_source": "Site Reliability Engineering (Betsy Beyer, Chris Jones, Jennifer Petoff, Niall Richard Murphy, O'Reilly, 2016)",
    "origin_source_zh": "《网站可靠性工程》（Betsy Beyer、Chris Jones、Jennifer Petoff、Niall Richard Murphy，O'Reilly，2016）",
    "complexity": "intermediate",
    "when_to_use": [
      "Establishing reliability commitments with product and business stakeholders that are grounded in measurable user experience data rather than infrastructure uptime proxies",
      "SRE teams needing a shared language with development teams for reliability trade-offs — SLIs make the cost of unreliability concrete and quantifiable",
      "Services experiencing user complaints that do not correlate with any existing infrastructure alert, indicating that current metrics do not capture actual user experience",
      "Before defining SLOs or error budgets, which require well-specified SLIs as their foundation"
    ],
    "when_to_use_zh": [
      "与产品和业务利益相关方建立基于可测量用户体验数据而非基础设施正常运行时间代理指标的可靠性承诺",
      "SRE团队需要与开发团队就可靠性权衡建立共同语言——SLI使不可靠性的成本具体且可量化",
      "服务遭受用户投诉但与任何现有基础设施告警不相关，表明当前指标未能捕获实际用户体验",
      "在定义SLO或错误预算之前，后者需要定义明确的SLI作为基础"
    ],
    "core_concepts": [
      "SLI Definition: a carefully defined quantitative measure of some aspect of the level of service provided — the ratio of good events to total events over a measurement window",
      "Request/Response SLIs: the most common category — availability (fraction of successful requests), latency (fraction of requests faster than a threshold), and error rate (fraction of failed requests)",
      "User-Centric Measurement: SLIs must be measured as close to the user as possible; internal CPU or memory metrics are lagging indicators that often fail to capture user-perceived degradation",
      "Measurement Windows: SLIs are typically calculated over rolling windows (28 days, 30 days) to smooth out short-term noise while still reflecting recent service health trends",
      "SLI → SLO Pipeline: each SLI feeds exactly one SLO target; the SLO converts the SLI ratio into a commitment (99.9% of requests succeed), and the gap between actual and target becomes the error budget"
    ],
    "core_concepts_zh": [
      "SLI定义：对所提供服务级别某个方面的精心定义的量化度量——在测量窗口内良好事件与总事件的比率",
      "请求/响应SLI：最常见的类别——可用性（成功请求的比例）、延迟（快于阈值的请求比例）和错误率（失败请求的比例）",
      "以用户为中心的测量：SLI必须尽可能贴近用户进行测量；内部CPU或内存指标是滞后指标，通常无法捕获用户感知的降级",
      "测量窗口：SLI通常在滚动窗口（28天、30天）内计算，以平滑短期噪声，同时仍能反映近期服务健康趋势",
      "SLI→SLO管道：每个SLI精确对应一个SLO目标；SLO将SLI比率转换为承诺（99.9%的请求成功），实际与目标之间的差距成为错误预算"
    ],
    "timeline": [
      [
        "2003",
        "Google's SRE team begins formalizing service reliability measurement as internal practice, developing the SLI/SLO/SLA framework"
      ],
      [
        "2016",
        "O'Reilly publishes the Google SRE Book, introducing SLI/SLO concepts to the industry; SRE practices begin spreading beyond Google"
      ],
      [
        "2018",
        "The SRE Workbook published with concrete SLI implementation guidance and worked examples for different service types"
      ],
      [
        "2022",
        "Cloud providers (AWS CloudWatch, Google Cloud Monitoring, Datadog) add first-class SLI/SLO tracking as managed service features"
      ]
    ],
    "timeline_zh": [
      [
        "2003",
        "Google SRE团队开始将服务可靠性测量正式化为内部实践，开发SLI/SLO/SLA框架"
      ],
      [
        "2016",
        "O'Reilly出版Google SRE书籍，将SLI/SLO概念引入行业；SRE实践开始在Google之外传播"
      ],
      [
        "2018",
        "《SRE工作手册》出版，为不同服务类型提供具体的SLI实施指导和实例"
      ],
      [
        "2022",
        "云提供商（AWS CloudWatch、Google Cloud Monitoring、Datadog）将SLI/SLO追踪作为托管服务功能添加"
      ]
    ],
    "dos": [
      "Do define SLIs from the user's perspective: measure what the user experiences (did their request succeed? was it fast enough?) rather than what the infrastructure experiences (is CPU below 80%?)",
      "Do keep SLI definitions simple and unambiguous: a good SLI can be expressed as a single sentence with a clear numerator, denominator, and measurement window",
      "Do validate that your SLI actually correlates with user-reported issues by retrospectively checking whether past incidents would have caused SLI degradation",
      "Do limit the number of SLIs per service to 3-5 key indicators — too many SLIs dilute focus and make it unclear which metric to act on during an incident"
    ],
    "dos_zh": [
      "从用户角度定义SLI：测量用户体验的内容（他们的请求是否成功？是否足够快？）而非基础设施体验的内容（CPU是否低于80%？）",
      "保持SLI定义简单明确：好的SLI可以用一句话表达，包含清晰的分子、分母和测量窗口",
      "通过回顾性检查过去的事故是否会导致SLI降级来验证SLI是否真正与用户报告的问题相关",
      "将每个服务的SLI数量限制在3-5个关键指标——过多的SLI会分散注意力，使事故期间不清楚应对哪个指标采取行动"
    ],
    "donts": [
      "Don't use internal system metrics (CPU, memory, disk) as SLIs — they are implementation details that don't directly measure user experience and often fail to trigger when users are actually impacted",
      "Don't set SLI measurement points deep inside the system (database query time) without also measuring at the service entry point — internal SLIs miss the compounding effects of the full request path",
      "Don't conflate SLI (what you measure) with SLO (what you commit to) — defining them separately allows the same SLI measurement to be used for different SLO targets across environments",
      "Don't change SLI definitions retroactively to make historical data look better — SLI instability destroys trust in the measurement and makes error budget calculations meaningless"
    ],
    "donts_zh": [
      "不要使用内部系统指标（CPU、内存、磁盘）作为SLI——它们是实现细节，不直接测量用户体验，且在用户实际受影响时往往无法触发",
      "不要仅在系统内部深处（数据库查询时间）设置SLI测量点而不在服务入口处测量——内部SLI遗漏了完整请求路径的累积效应",
      "不要将SLI（你测量的内容）与SLO（你承诺的内容）混淆——分别定义它们允许相同的SLI测量用于不同环境的不同SLO目标",
      "不要为了让历史数据看起来更好而回溯修改SLI定义——SLI的不稳定性会破坏对测量的信任，使错误预算计算失去意义"
    ],
    "case_study_company": "Google",
    "case_study": "Google's Search infrastructure team defined their primary SLI as the proportion of search queries that return a valid results page with at least one result in under 200ms, measured at the global load balancer — not at any individual backend component. This single SLI captured the end-to-end user experience across the entire search stack. When a backend ranking service degraded, the SLI immediately showed impact even when all individual component health checks appeared green. The SLI definition allowed Google to set a 99.99% SLO for search latency, and the corresponding error budget of 4.38 minutes/month gave the SRE team a quantitative basis for negotiating feature freeze during high-risk release windows.",
    "case_study_zh": "Google搜索基础设施团队将其主要SLI定义为在200毫秒内返回包含至少一条结果的有效结果页面的搜索查询比例，在全球负载均衡器处测量——而非任何单个后端组件处。这单个SLI捕获了整个搜索堆栈的端到端用户体验。当后端排序服务降级时，即使所有单独组件的健康检查显示正常，SLI也立即显示出影响。SLI定义使Google能够为搜索延迟设置99.99%的SLO，相应的每月4.38分钟错误预算为SRE团队提供了在高风险发布窗口期间协商功能冻结的量化依据。",
    "when_not_to_use": [
      "Internal batch processing systems with no direct user interaction where response-time SLIs are meaningless and throughput or completion-rate SLIs better reflect the system's purpose",
      "Very early-stage systems where instrumentation infrastructure does not yet exist and the cost of adding it exceeds the value of measurement at that stage",
      "Systems with inherently variable behavior (scientific computing, ML training jobs) where SLI measurement windows and thresholds cannot be meaningfully defined"
    ],
    "when_not_to_use_zh": [
      "没有直接用户交互的内部批处理系统，响应时间SLI无意义，吞吐量或完成率SLI更能反映系统目的",
      "埋点基础设施尚不存在的极早期系统，此阶段添加埋点的成本超过测量的价值",
      "具有固有可变行为的系统（科学计算、ML训练任务），无法有意义地定义SLI测量窗口和阈值"
    ],
    "adopters": [
      "Google",
      "Spotify",
      "LinkedIn",
      "Dropbox",
      "PagerDuty",
      "Atlassian"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "observability",
      "performance"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Beyer, B., Jones, C., Petoff, J., Murphy, N.R. (2016). \"Site Reliability Engineering: How Google Runs Production Systems\". O'Reilly Media.",
    "secondary_sources": [
      "Beyer, B., Murphy, N.R., Rensin, D., Kawahara, K., Thorne, S. (2018). \"The Site Reliability Workbook\". O'Reilly Media.",
      "Google Cloud (2023). \"SLO Monitoring and Alerting in Cloud Monitoring\". cloud.google.com.",
      "CRE Life Lessons (2017). \"Understanding SLOs and SLAs\". Google Cloud Blog."
    ],
    "typed_relations": [
      {
        "slug": "sli-slo-sla",
        "type": "related"
      },
      {
        "slug": "four-golden-signals",
        "type": "complement"
      }
    ]
  },
  {
    "id": 274,
    "name": "Synthetic Monitoring",
    "name_zh": "合成监控",
    "slug": "synthetic-monitoring",
    "category": "observability",
    "desc": "Proactive testing of user journeys by scripting interactions from external locations to detect failures before real users are impacted",
    "desc_zh": "通过从外部地点脚本化用户交互来主动测试用户旅程，在真实用户受到影响之前检测故障",
    "steps": [
      "Identify the critical user journeys to monitor: prioritize flows that directly affect revenue or user retention (login, checkout, core API health checks) rather than trying to cover every page",
      "Write interaction scripts using headless browser tools (Playwright, Selenium, Puppeteer) or API test scripts that mimic real user behavior including authentication, form submissions, and assertions on response content",
      "Deploy monitors from multiple geographic locations to detect regional failures, CDN issues, and latency variations that affect specific user populations but not global availability metrics",
      "Configure alert thresholds: set failure alerts (3 consecutive failures from 2+ locations) and latency alerts (P95 > 2x baseline) with clear runbooks describing what to check when each monitor fails",
      "Integrate synthetic monitor results into your SLO dashboards as an external availability signal, complementing real-user monitoring with proactive coverage for low-traffic paths and off-peak hours"
    ],
    "steps_zh": [
      "识别要监控的关键用户旅程：优先监控直接影响收入或用户留存的流程（登录、结账、核心API健康检查），而非试图覆盖每个页面",
      "使用无头浏览器工具（Playwright、Selenium、Puppeteer）或API测试脚本编写交互脚本，模拟真实用户行为，包括认证、表单提交和响应内容断言",
      "从多个地理位置部署监控器，以检测影响特定用户群但不影响全局可用性指标的区域性故障、CDN问题和延迟变化",
      "配置告警阈值：设置故障告警（来自2个以上地点的连续3次失败）和延迟告警（P95>基线2倍），并附有清晰的运行手册描述每个监控器失败时应检查的内容",
      "将合成监控结果作为外部可用性信号集成到SLO仪表盘中，用主动覆盖低流量路径和非高峰时段来补充真实用户监控"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Critical Journeys",
      "Interaction Scripts",
      "Multi-Region",
      "Alert Thresholds",
      "SLO Integrate"
    ],
    "viz_labels_zh": [
      "关键旅程",
      "交互脚本",
      "多地域",
      "告警阈值",
      "SLO集成"
    ],
    "related": [
      "service-level-indicators",
      "sli-slo-sla",
      "distributed-tracing",
      "chaos-engineering"
    ],
    "tags": [
      "synthetic-monitoring",
      "observability",
      "testing",
      "availability",
      "uptime",
      "catchpoint",
      "playwright"
    ],
    "origin_author": "Catchpoint",
    "origin_source": "Catchpoint Systems product documentation and industry practice, 2008",
    "origin_source_zh": "Catchpoint Systems产品文档及行业实践，2008年",
    "complexity": "intermediate",
    "when_to_use": [
      "Monitoring critical user journeys that have low natural traffic volume — synthetic tests ensure these paths are continuously exercised even during off-peak hours",
      "Detecting geographic or ISP-specific outages before real users in affected regions encounter them and before support tickets start arriving",
      "Establishing a baseline availability SLI for external-facing services where you need an objective, external perspective on service availability rather than relying solely on internal metrics",
      "Validating that deployments haven't broken critical flows immediately after release, providing faster feedback than waiting for real user error rates to climb"
    ],
    "when_to_use_zh": [
      "监控自然流量较少的关键用户旅程——合成测试确保这些路径即使在非高峰时段也能持续被测试",
      "在受影响地区的真实用户遭遇故障并在支持工单开始涌入之前，检测地理或ISP特定的中断",
      "为面向外部的服务建立基线可用性SLI，需要对服务可用性有客观的外部视角，而不仅依赖内部指标",
      "在发布后立即验证部署没有破坏关键流程，提供比等待真实用户错误率上升更快的反馈"
    ],
    "core_concepts": [
      "Transaction Monitoring: scripted multi-step user flows (login → add to cart → checkout) that run on a schedule from multiple locations, asserting that each step completes within acceptable time and produces the expected result",
      "API Monitoring: HTTP-level checks that verify endpoint availability, response time, status codes, and response body content, providing fast and lightweight coverage for backend services",
      "Real Browser Testing: executing scripts in a full browser engine (Chrome headless, Firefox) to capture client-side JavaScript performance, rendering issues, and third-party dependency failures",
      "Multi-location Probing: running the same script from geographically distributed probe nodes (cloud regions, ISP vantage points) to distinguish local outages from global failures and measure regional latency",
      "Alert Correlation: combining synthetic monitor failures with APM traces and log events to accelerate root cause analysis — a synthetic failure timestamp narrows the search window for correlated anomalies"
    ],
    "core_concepts_zh": [
      "事务监控：脚本化的多步骤用户流程（登录→加入购物车→结账），按计划从多个地点运行，断言每个步骤在可接受时间内完成并产生预期结果",
      "API监控：验证端点可用性、响应时间、状态码和响应体内容的HTTP级检查，为后端服务提供快速轻量的覆盖",
      "真实浏览器测试：在完整浏览器引擎（Chrome headless、Firefox）中执行脚本，捕获客户端JavaScript性能、渲染问题和第三方依赖故障",
      "多地点探测：从地理分布的探测节点（云区域、ISP观测点）运行相同脚本，区分本地中断与全局故障，测量区域延迟",
      "告警关联：将合成监控失败与APM追踪和日志事件结合，加速根本原因分析——合成失败时间戳缩小了关联异常的搜索窗口"
    ],
    "timeline": [
      [
        "2000",
        "Keynote Systems (later Dynatrace) launches one of the first commercial synthetic monitoring services, testing web availability from external locations"
      ],
      [
        "2008",
        "Catchpoint founded, advancing synthetic monitoring with global probe networks and multi-step transaction testing"
      ],
      [
        "2015",
        "Datadog and New Relic introduce synthetic monitoring as integrated features within their APM platforms, democratizing access"
      ],
      [
        "2020",
        "Playwright and Puppeteer adoption drives headless browser synthetic testing into CI/CD pipelines, blurring the line between testing and monitoring"
      ]
    ],
    "timeline_zh": [
      [
        "2000",
        "Keynote Systems（后来的Dynatrace）推出最早的商业合成监控服务之一，从外部位置测试Web可用性"
      ],
      [
        "2008",
        "Catchpoint成立，通过全球探测网络和多步骤事务测试推进合成监控"
      ],
      [
        "2015",
        "Datadog和New Relic将合成监控作为APM平台的集成功能引入，使访问民主化"
      ],
      [
        "2020",
        "Playwright和Puppeteer的采用推动无头浏览器合成测试进入CI/CD管道，模糊了测试与监控之间的界限"
      ]
    ],
    "dos": [
      "Do run synthetic monitors from at least 3 geographically distinct locations so that a single probe node failure doesn't trigger false positive alerts",
      "Do write synthetic scripts that use dedicated test accounts with predictable data states rather than real user accounts, to avoid polluting production data or triggering business-side workflows",
      "Do configure monitors to run frequently enough to detect issues promptly (every 1-5 minutes for critical paths) but not so frequently that you generate excessive API load or cost",
      "Do version-control your synthetic monitor scripts alongside application code so changes to the application UI or API are reflected in updated monitor scripts through the same review process"
    ],
    "dos_zh": [
      "从至少3个地理位置不同的地点运行合成监控器，以避免单个探测节点故障触发误报告警",
      "编写使用专用测试账户（具有可预测数据状态）而非真实用户账户的合成脚本，以避免污染生产数据或触发业务侧工作流",
      "将监控器配置为足够频繁地运行以及时检测问题（关键路径每1-5分钟一次），但不要过于频繁导致生成过多API负载或成本",
      "将合成监控脚本与应用代码一起纳入版本控制，以便通过相同的评审流程在更新的监控脚本中反映应用UI或API的变更"
    ],
    "donts": [
      "Don't rely solely on synthetic monitoring as your availability signal — synthetic tests probe a subset of user journeys from scripted paths; real user monitoring (RUM) captures the long tail of actual usage patterns",
      "Don't write brittle scripts that depend on specific UI element positions or text content that changes frequently — use stable data attributes or API contracts as assertion targets",
      "Don't alert on a single synthetic failure from a single location — transient network issues cause false positives; require multiple consecutive failures from multiple locations before paging on-call",
      "Don't neglect to update synthetic scripts when the application changes — stale scripts that test deprecated flows provide false confidence and miss regressions in the new implementation"
    ],
    "donts_zh": [
      "不要仅依赖合成监控作为可用性信号——合成测试从脚本化路径探测用户旅程的子集；真实用户监控（RUM）捕获实际使用模式的长尾",
      "不要编写依赖频繁变化的特定UI元素位置或文本内容的脆弱脚本——使用稳定的数据属性或API契约作为断言目标",
      "不要对单个地点的单次合成失败发出告警——瞬态网络问题会导致误报；在通知待命人员之前要求来自多个地点的多次连续失败",
      "不要在应用变更时忽视更新合成脚本——测试已废弃流程的过时脚本提供虚假的信心，并遗漏新实现中的回归"
    ],
    "case_study_company": "Shopify",
    "case_study": "During Black Friday 2021, Shopify's platform team used synthetic monitoring to continuously exercise the end-to-end checkout flow for multiple merchant store types from 12 geographic probe locations. When a payment gateway integration began experiencing elevated latency in the EU-West region at 2:47 AM EST — well before European merchants' peak traffic — the synthetic monitors triggered an alert within 4 minutes of degradation onset. The on-call SRE used the synthetic transaction trace (which captured the exact HTTP exchange at the payment step) to identify that a third-party payment provider's EU endpoint was returning slow responses. The team activated a backup payment routing configuration before EU merchants opened for business, preventing what would have been a €2M+ revenue impact during the critical trading window.",
    "case_study_zh": "在2021年黑色星期五期间，Shopify平台团队使用合成监控从12个地理探测位置持续测试多种商家店铺类型的端到端结账流程。当支付网关集成在欧洲西部地区于美东时间凌晨2:47开始出现延迟升高时——远早于欧洲商家的流量高峰——合成监控器在降级发生后4分钟内触发告警。待命SRE使用合成事务追踪（捕获了支付步骤的确切HTTP交换）识别出第三方支付提供商的欧洲端点正在返回慢响应。团队在欧洲商家开始营业前激活了备用支付路由配置，避免了关键交易窗口期间超过200万欧元的潜在收入损失。",
    "when_not_to_use": [
      "Internal-only services with no external-facing endpoints where external probe locations have no network path to reach the service",
      "Highly dynamic single-page applications with frequent A/B test UI changes where maintaining synthetic scripts becomes more expensive than the coverage they provide",
      "Services with strict security constraints that prohibit external systems from making authenticated requests against production endpoints"
    ],
    "when_not_to_use_zh": [
      "没有面向外部端点的仅内部服务，外部探测位置没有到达该服务的网络路径",
      "具有频繁A/B测试UI变更的高度动态单页应用，维护合成脚本的成本超过其提供的覆盖价值",
      "具有严格安全限制的服务，禁止外部系统对生产端点进行认证请求"
    ],
    "adopters": [
      "Shopify",
      "Amazon",
      "Cloudflare",
      "Datadog",
      "Catchpoint",
      "New Relic"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Catchpoint Systems (2008). \"Synthetic Monitoring: Proactive Performance Management\". catchpoint.com.",
    "secondary_sources": [
      "Datadog (2023). \"Synthetic Monitoring Documentation\". docs.datadoghq.com.",
      "Google (2023). \"Cloud Monitoring Uptime Checks\". cloud.google.com.",
      "Nygard, M.T. (2007). \"Release It! Design and Deploy Production-Ready Software\". Pragmatic Bookshelf."
    ],
    "typed_relations": [
      {
        "slug": "service-level-indicators",
        "type": "complement"
      },
      {
        "slug": "chaos-engineering",
        "type": "related"
      },
      {
        "slug": "distributed-tracing",
        "type": "complement"
      }
    ]
  },
  {
    "id": 275,
    "name": "Log Aggregation Patterns",
    "name_zh": "日志聚合模式",
    "slug": "log-aggregation-patterns",
    "category": "observability",
    "desc": "Centralized collection, parsing, and querying of logs from distributed services using platforms like ELK, Loki, and Datadog",
    "desc_zh": "使用ELK、Loki和Datadog等平台对分布式服务的日志进行集中收集、解析和查询",
    "steps": [
      "Standardize log format across all services: adopt structured JSON logging with consistent fields (timestamp, level, service, trace_id, request_id, user_id) so logs can be parsed and queried without custom per-service rules",
      "Choose and deploy a log aggregation stack: ELK (Elasticsearch + Logstash + Kibana) for full-text search and analytics, Loki for label-based querying with lower storage cost, or a managed platform like Datadog Logs for operational simplicity",
      "Configure log shippers on each host or container (Filebeat, Fluentd, Promtail, Vector) to tail log files or consume stdout and forward to the central aggregation system with service metadata enrichment",
      "Define log retention policies by log category: security and audit logs typically require 1-7 years; application debug logs can be retained for 7-30 days; balance queryability with storage cost",
      "Correlate logs with traces and metrics: inject trace IDs into every log line and configure your observability backend to pivot from a log entry to its associated distributed trace for rapid incident diagnosis"
    ],
    "steps_zh": [
      "跨所有服务标准化日志格式：采用带有一致字段（timestamp、level、service、trace_id、request_id、user_id）的结构化JSON日志，使日志无需自定义按服务规则即可解析和查询",
      "选择并部署日志聚合技术栈：ELK（Elasticsearch+Logstash+Kibana）用于全文搜索和分析，Loki用于基于标签的查询且存储成本更低，或使用Datadog日志等托管平台以降低运维复杂度",
      "在每台主机或容器上配置日志采集器（Filebeat、Fluentd、Promtail、Vector），以跟踪日志文件或消费stdout，并通过服务元数据丰富后转发到中央聚合系统",
      "按日志类别定义日志保留策略：安全和审计日志通常需要保留1-7年；应用调试日志可保留7-30天；在可查询性与存储成本之间取得平衡",
      "将日志与追踪和指标关联：在每条日志行中注入Trace ID，并配置可观测性后端以从日志条目跳转到其关联的分布式追踪，用于快速事故诊断"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Standardize Format",
      "Aggregation Stack",
      "Log Shippers",
      "Retention Policy",
      "Trace Correlate"
    ],
    "viz_labels_zh": [
      "标准格式",
      "聚合栈",
      "日志采集",
      "保留策略",
      "与追踪关联"
    ],
    "related": [
      "opentelemetry",
      "distributed-tracing",
      "structured-logging",
      "four-golden-signals",
      "observability-as-code"
    ],
    "tags": [
      "logging",
      "elk",
      "loki",
      "datadog",
      "observability",
      "log-aggregation",
      "structured-logging"
    ],
    "origin_author": "Jay Kreps",
    "origin_source": "Kreps, J. (2013). \"I Heart Logs: Event Data, Stream Processing, and Data Integration\". O'Reilly Media.",
    "origin_source_zh": "Kreps, J.（2013）.《I Heart Logs：事件数据、流处理与数据集成》. O'Reilly Media.",
    "complexity": "intermediate",
    "when_to_use": [
      "Operating distributed systems with more than 3-4 services where correlating events across individual service logs becomes impractical without centralized search",
      "Compliance and audit requirements mandate centralized, tamper-evident log storage with defined retention periods and access controls",
      "Incident response times are slow because engineers spend hours SSH-ing into individual servers to grep through logs rather than querying a central log store",
      "Security operations teams need to detect patterns across log sources (failed logins across multiple services, unusual API access patterns) that are impossible to see in siloed logs"
    ],
    "when_to_use_zh": [
      "运营拥有超过3-4个服务的分布式系统，在没有集中搜索的情况下，关联各个服务日志中的事件变得不切实际",
      "合规和审计要求要求集中、防篡改的日志存储，具有明确的保留期和访问控制",
      "事故响应时间缓慢，因为工程师花费数小时SSH登录各个服务器通过grep查找日志，而非查询中央日志存储",
      "安全运营团队需要检测跨日志源的模式（多个服务的登录失败、异常API访问模式），这在孤立日志中是不可能看到的"
    ],
    "core_concepts": [
      "Log Shipper: a lightweight agent (Filebeat, Fluentd, Promtail, Vector) deployed alongside each service that tails log output, applies parsing and enrichment, and forwards logs to the central aggregation backend with minimal overhead",
      "Index vs Label-based Storage: Elasticsearch indexes log content for full-text search (high cost, high flexibility); Loki uses label-indexed streams (low cost, queried by metadata first then content) — the choice drives cost and query patterns",
      "Structured Logging: emitting logs as JSON objects with typed fields rather than freeform strings, enabling precise filtering (level=ERROR AND service=checkout AND user_id=12345) without fragile regex parsing",
      "Log Correlation: embedding trace_id and span_id in every log line enables jumping from a log event to its parent distributed trace in Jaeger/Zipkin, turning logs from isolated events into a navigable signal",
      "Retention Tiering: hot storage (7-30 days, fast query) → warm storage (30-90 days, slower) → cold/archive (compliance retention, rarely queried) — matching query frequency to storage cost"
    ],
    "core_concepts_zh": [
      "日志采集器：轻量级代理（Filebeat、Fluentd、Promtail、Vector），与每个服务一起部署，跟踪日志输出，应用解析和丰富，并以最小开销将日志转发到中央聚合后端",
      "索引型与标签型存储：Elasticsearch对日志内容建立全文搜索索引（高成本、高灵活性）；Loki使用标签索引流（低成本，先按元数据后按内容查询）——选择决定了成本和查询模式",
      "结构化日志：以带有类型化字段的JSON对象而非自由格式字符串发出日志，支持精确过滤（level=ERROR AND service=checkout AND user_id=12345），无需脆弱的正则表达式解析",
      "日志关联：在每条日志行中嵌入trace_id和span_id，支持从日志事件跳转到Jaeger/Zipkin中其父级分布式追踪，将日志从孤立事件转变为可导航信号",
      "保留分层：热存储（7-30天，快速查询）→温存储（30-90天，较慢）→冷/归档存储（合规保留，很少查询）——将查询频率与存储成本匹配"
    ],
    "timeline": [
      [
        "2004",
        "Syslog becomes the de facto standard for Unix log aggregation, but its unstructured format limits machine parsing at scale"
      ],
      [
        "2010",
        "Elasticsearch released; combined with Logstash and Kibana as the ELK stack, it becomes the dominant open-source log analytics platform"
      ],
      [
        "2013",
        "Jay Kreps publishes 'I Heart Logs', articulating logs as the universal data integration primitive and influencing stream-based log architectures"
      ],
      [
        "2018",
        "Grafana Loki announced at KubeCon NA, introducing a label-based approach inspired by Prometheus that dramatically reduces log storage cost for Kubernetes environments"
      ]
    ],
    "timeline_zh": [
      [
        "2004",
        "Syslog成为Unix日志聚合的事实标准，但其非结构化格式限制了大规模机器解析"
      ],
      [
        "2010",
        "Elasticsearch发布；与Logstash和Kibana组合为ELK技术栈，成为主流的开源日志分析平台"
      ],
      [
        "2013",
        "Jay Kreps发布《I Heart Logs》，将日志阐述为通用数据集成原语，影响了基于流的日志架构"
      ],
      [
        "2018",
        "在KubeCon NA上宣布Grafana Loki，引入受Prometheus启发的基于标签的方法，大幅降低了Kubernetes环境的日志存储成本"
      ]
    ],
    "dos": [
      "Do enforce structured JSON logging as a service standard — require it in code review and lint for freeform log strings; unstructured logs become ungrepable liabilities at scale",
      "Do inject a correlation ID (trace_id or request_id) at the request entry point and propagate it through all downstream service calls so any log line can be used to find all related events",
      "Do set log levels deliberately: use DEBUG for development, INFO for significant application events, WARN for recoverable issues, ERROR for failures requiring attention — and configure production to ship only INFO and above by default",
      "Do test your log aggregation pipeline in a staging environment that mirrors production load, as high-cardinality label explosions in Loki or large index mappings in Elasticsearch often only surface at production volume"
    ],
    "dos_zh": [
      "将结构化JSON日志作为服务标准强制执行——在代码审查中要求它并对自由格式日志字符串进行lint检查；非结构化日志在规模化时成为无法grep的负担",
      "在请求入口点注入关联ID（trace_id或request_id）并将其传播到所有下游服务调用，使任何日志行都可用于查找所有相关事件",
      "刻意设置日志级别：开发使用DEBUG，重要应用事件使用INFO，可恢复问题使用WARN，需要关注的故障使用ERROR——默认情况下配置生产环境仅发送INFO及以上级别",
      "在模拟生产负载的预发布环境中测试日志聚合管道，因为Loki中的高基数标签爆炸或Elasticsearch中的大型索引映射通常只在生产规模下才会出现"
    ],
    "donts": [
      "Don't log sensitive data (passwords, tokens, PII, card numbers) even at DEBUG level — build automated PII detection into your log shipper pipeline as a safety net for accidental data leakage",
      "Don't use high-cardinality values (user IDs, request IDs) as Loki labels or Elasticsearch field names that are indexed — they cause label explosion or mapping explosion that degrades query performance and increases cost",
      "Don't treat all logs equally in retention policy — storing debug-level logs for years is expensive and rarely valuable; define tiered retention policies based on log category and compliance requirement",
      "Don't skip centralized log sampling for very high-volume debug logging paths — 100% log capture at millions of requests per second is prohibitively expensive; sample at the shipper level for non-critical paths"
    ],
    "donts_zh": [
      "不要记录敏感数据（密码、令牌、PII、卡号），即使在DEBUG级别——在日志采集器管道中构建自动化PII检测作为意外数据泄露的安全网",
      "不要使用高基数值（用户ID、请求ID）作为Loki标签或被索引的Elasticsearch字段名——它们会导致标签爆炸或映射爆炸，降低查询性能并增加成本",
      "不要在保留策略中平等对待所有日志——将调试级别日志存储多年既昂贵又很少有价值；根据日志类别和合规要求定义分层保留策略",
      "不要跳过对非常高流量调试日志路径的集中日志采样——在每秒数百万请求时100%日志捕获成本高得令人望而却步；在采集器级别对非关键路径进行采样"
    ],
    "case_study_company": "Airbnb",
    "case_study": "Airbnb migrated from host-based log tailing to a centralized ELK-based log aggregation platform as their microservices architecture grew to over 200 services. The migration standardized log format to structured JSON across all services and deployed Filebeat sidecars in their Kubernetes pods. The key outcome was incident response: before centralization, identifying the root cause of a checkout failure required 45-90 minutes of log correlation across 8+ services. After deploying centralized log search with trace ID correlation, the same investigation took under 10 minutes. Airbnb also implemented automated log-based anomaly detection that caught 3 silent data integrity issues in 6 months that had no corresponding metric alerts — bugs that were writing incorrect data to the database without failing loudly enough to trigger error rate SLOs.",
    "case_study_zh": "随着Airbnb的微服务架构增长到200多个服务，他们从基于主机的日志跟踪迁移到基于ELK的集中式日志聚合平台。迁移将所有服务的日志格式标准化为结构化JSON，并在Kubernetes Pod中部署Filebeat旁路容器。关键成果在于事故响应：集中化之前，识别结账失败的根本原因需要在8个以上服务间关联日志45-90分钟。部署带Trace ID关联的集中式日志搜索后，同样的调查只需不到10分钟。Airbnb还实现了基于日志的自动异常检测，在6个月内发现了3个无对应指标告警的静默数据完整性问题——这些bug在向数据库写入错误数据时没有大声地失败到足以触发错误率SLO。",
    "when_not_to_use": [
      "Single-service monolithic applications where application logs are accessible on one or a few hosts and the operational overhead of a log aggregation stack outweighs the convenience benefit",
      "Strictly air-gapped environments where regulations prohibit sending log data to any external or centralized system, requiring per-host log analysis tooling instead",
      "Ultra-low-latency systems where the overhead of structured JSON logging and log shipper processes is a measurable performance bottleneck on the critical path"
    ],
    "when_not_to_use_zh": [
      "单服务单体应用，应用日志可在一台或几台主机上访问，日志聚合技术栈的运维开销超过便利收益",
      "严格的物理隔离环境，法规禁止将日志数据发送到任何外部或集中式系统，需要使用按主机的日志分析工具",
      "超低延迟系统，结构化JSON日志和日志采集器进程的开销在关键路径上是可测量的性能瓶颈"
    ],
    "adopters": [
      "Airbnb",
      "Uber",
      "Netflix",
      "Wikimedia Foundation",
      "Elastic (ELK Stack)",
      "Grafana Labs (Loki)"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "observability",
      "security",
      "maintainability"
    ],
    "maturity_ring": "foundational",
    "primary_source": "Kreps, J. (2013). \"I Heart Logs: Event Data, Stream Processing, and Data Integration\". O'Reilly Media.",
    "secondary_sources": [
      "Elastic (2023). \"Elasticsearch Guide: Log and Event Data\". elastic.co/guide.",
      "Grafana Labs (2023). \"Loki Documentation: Log Aggregation System for Kubernetes\". grafana.com/docs/loki.",
      "Sridharan, C. (2018). \"Distributed Systems Observability\". O'Reilly Media."
    ],
    "typed_relations": [
      {
        "slug": "opentelemetry",
        "type": "complement"
      },
      {
        "slug": "distributed-tracing",
        "type": "complement"
      },
      {
        "slug": "observability-as-code",
        "type": "related"
      }
    ]
  },
  {
    "id": 276,
    "name": "Canary Analysis",
    "name_zh": "金丝雀分析",
    "slug": "canary-analysis",
    "category": "observability",
    "desc": "Automated comparison of canary vs baseline metrics to quantitatively validate deployments before full rollout",
    "desc_zh": "自动比较金丝雀与基线指标，在全量发布前定量验证部署",
    "steps": [
      "Deploy the canary version to a small percentage of traffic (1-5%) alongside the current baseline, using feature flags, traffic splitting, or a canary deployment controller to ensure both receive comparable request distributions",
      "Define the metrics to compare: select SLIs that directly reflect user experience (request success rate, P99 latency, error rate by type) and business metrics (conversion rate, checkout completion) for the service under analysis",
      "Run automated canary analysis using a tool like Spinnaker's Kayenta or Argo Rollouts to statistically compare canary vs baseline metric distributions over a defined bake time (typically 30-60 minutes)",
      "Apply a scoring model: each metric comparison produces a pass/fail/warn result; a weighted aggregate score determines whether the canary is promoted, paused, or automatically rolled back",
      "Act on the analysis result: automatically promote healthy canaries (score above threshold) to 100% of traffic, pause inconclusive canaries for human review, and automatically roll back canaries that score below the failure threshold"
    ],
    "steps_zh": [
      "将金丝雀版本部署到一小部分流量（1-5%），与当前基线并行运行，使用功能标志、流量分割或金丝雀部署控制器确保两者接收可比的请求分布",
      "定义要比较的指标：选择直接反映用户体验（请求成功率、P99延迟、按类型的错误率）和业务指标（转化率、结账完成率）的SLI用于分析",
      "使用Spinnaker的Kayenta或Argo Rollouts等工具运行自动化金丝雀分析，在定义的烘焙时间（通常30-60分钟）内对金丝雀与基线指标分布进行统计比较",
      "应用评分模型：每次指标比较产生通过/失败/警告结果；加权综合得分决定金丝雀是晋级、暂停还是自动回滚",
      "根据分析结果采取行动：自动将健康金丝雀（得分高于阈值）晋级到100%流量，暂停不确定的金丝雀供人工审查，自动回滚得分低于失败阈值的金丝雀"
    ],
    "ai_relevant": true,
    "viz_type": "flow",
    "viz_labels": [
      "Canary Deploy",
      "Metric Compare",
      "Auto Analysis",
      "Scoring Model",
      "Promote / Rollback"
    ],
    "viz_labels_zh": [
      "金丝雀部署",
      "指标对比",
      "自动分析",
      "评分模型",
      "晋级回滚"
    ],
    "related": [
      "feature-flags",
      "blue-green-deployment",
      "sli-slo-sla",
      "service-level-indicators",
      "chaos-engineering",
      "progressive-delivery"
    ],
    "tags": [
      "canary",
      "deployment",
      "progressive-delivery",
      "netflix",
      "spinnaker",
      "kayenta",
      "rollout",
      "observability"
    ],
    "origin_author": "Netflix",
    "origin_source": "Netflix Tech Blog: Automated Canary Analysis at Netflix with Kayenta (2018)",
    "origin_source_zh": "Netflix技术博客：使用Kayenta在Netflix进行自动化金丝雀分析（2018）",
    "complexity": "advanced",
    "when_to_use": [
      "Deploying changes to high-traffic, revenue-critical services where the blast radius of a bad deployment justifies the additional deployment complexity of traffic splitting and automated analysis",
      "Teams that have experienced production incidents caused by deployments that passed all pre-production tests but introduced subtle regressions only visible at production traffic patterns",
      "Organizations practicing continuous deployment where human approval of every deployment is a bottleneck — canary analysis automates the go/no-go decision with statistical rigor",
      "Services with clear, stable SLIs where the baseline metric distributions are well-understood and anomalies are distinguishable from normal variance"
    ],
    "when_to_use_zh": [
      "为高流量、对收入至关重要的服务部署变更，其中不良部署的影响范围足以证明流量分割和自动化分析的额外部署复杂度是合理的",
      "曾经历因部署导致生产事故的团队，这些部署通过了所有预生产测试，但引入了只在生产流量模式下才可见的细微回归",
      "实践持续部署的组织，每次部署的人工审批是瓶颈——金丝雀分析以统计严谨性自动化去/不去决策",
      "具有清晰、稳定SLI的服务，基线指标分布已充分理解，异常可与正常方差区分"
    ],
    "core_concepts": [
      "Traffic Splitting: routing a configurable percentage (1-10%) of production traffic to the canary version while the remainder continues to the stable baseline, enabling real-production comparison without full rollout risk",
      "Statistical Comparison: using Mann-Whitney U tests, effect size analysis, or Bayesian methods to determine whether observed metric differences between canary and baseline are statistically significant or within normal variance",
      "Bake Time: the duration the canary receives production traffic before analysis is finalized — long enough to collect statistically significant samples across different traffic conditions, typically 30 minutes to 2 hours",
      "Scoring Model: a weighted formula that aggregates individual metric comparisons into an overall canary health score; critical metrics (error rate) have higher weights than informational metrics",
      "Automated Rollback: when canary score falls below a configured failure threshold, the deployment system automatically shifts 100% of traffic back to the stable baseline without human intervention, minimizing mean time to recover (MTTR)"
    ],
    "core_concepts_zh": [
      "流量分割：将可配置比例（1-10%）的生产流量路由到金丝雀版本，其余流量继续流向稳定基线，实现真实生产比较而不承担全量发布风险",
      "统计比较：使用Mann-Whitney U检验、效应量分析或贝叶斯方法确定金丝雀与基线之间观察到的指标差异是否具有统计显著性，或是否在正常方差范围内",
      "烘焙时间：金丝雀接收生产流量直到分析完成的持续时间——足够长以在不同流量条件下收集统计显著的样本，通常为30分钟到2小时",
      "评分模型：将各个指标比较汇总为整体金丝雀健康得分的加权公式；关键指标（错误率）的权重高于信息性指标",
      "自动回滚：当金丝雀得分低于配置的失败阈值时，部署系统自动将100%流量切回稳定基线，无需人工干预，最小化平均恢复时间（MTTR）"
    ],
    "timeline": [
      [
        "2009",
        "Netflix begins manually reviewing canary deployments as part of their continuous delivery practice, routing small traffic percentages to new builds before full rollout"
      ],
      [
        "2016",
        "Netflix develops Kayenta internally as an automated canary analysis service integrated with Spinnaker, formalizing statistical comparison of canary vs baseline metrics"
      ],
      [
        "2018",
        "Netflix open-sources Kayenta; Google partners with Netflix to release it as part of Spinnaker, making automated canary analysis accessible to the broader industry"
      ],
      [
        "2021",
        "Argo Rollouts adds canary analysis with Kayenta and Prometheus integration, bringing automated canary analysis natively into Kubernetes GitOps workflows"
      ]
    ],
    "timeline_zh": [
      [
        "2009",
        "Netflix开始在持续交付实践中手动审查金丝雀部署，在全量发布前将小比例流量路由到新构建版本"
      ],
      [
        "2016",
        "Netflix内部开发Kayenta作为与Spinnaker集成的自动化金丝雀分析服务，将金丝雀与基线指标的统计比较正式化"
      ],
      [
        "2018",
        "Netflix开源Kayenta；Google与Netflix合作将其作为Spinnaker的一部分发布，使自动化金丝雀分析对更广泛的行业可用"
      ],
      [
        "2021",
        "Argo Rollouts添加具有Kayenta和Prometheus集成的金丝雀分析，将自动化金丝雀分析原生引入Kubernetes GitOps工作流"
      ]
    ],
    "dos": [
      "Do ensure canary and baseline receive statistically comparable traffic: both should see similar request distributions (same user segments, same API endpoint mix) to avoid comparison bias",
      "Do set canary traffic percentage high enough to collect meaningful samples within your bake time, but low enough to limit blast radius — 1-5% is typical for high-traffic services",
      "Do define your canary scoring thresholds based on historical baseline metric variance, not arbitrary percentages — a 5% increase in error rate may be noise for one service and critical for another",
      "Do automate the rollback decision: the primary value of canary analysis is speed of automated rollback — requiring human approval for rollback defeats the purpose for high-velocity deployment pipelines"
    ],
    "dos_zh": [
      "确保金丝雀和基线接收统计上可比的流量：两者应看到类似的请求分布（相同用户群、相同API端点混合），以避免比较偏差",
      "将金丝雀流量百分比设置得足够高以在烘焙时间内收集有意义的样本，但足够低以限制影响范围——1-5%对高流量服务是典型的",
      "根据历史基线指标方差而非任意百分比定义金丝雀评分阈值——5%的错误率增加对某个服务可能是噪音，对另一个服务可能是关键",
      "自动化回滚决策：金丝雀分析的主要价值在于自动化回滚的速度——对高速部署管道来说，要求人工审批回滚会使目的落空"
    ],
    "donts": [
      "Don't use canary analysis as a substitute for pre-production testing — canary analysis catches regressions that only appear at production scale and traffic mix, not issues that should have been caught by unit or integration tests",
      "Don't compare canary against a single baseline instance — compare against the aggregate of all stable instances to account for instance-level variance and avoid false positives from individual instance noise",
      "Don't run canary analysis during atypical traffic periods (scheduled maintenance windows, major events, traffic spikes) when baseline metrics are themselves abnormal",
      "Don't set bake times so short that the canary hasn't experienced enough traffic across different time-of-day patterns to produce statistically valid comparisons"
    ],
    "donts_zh": [
      "不要将金丝雀分析作为预生产测试的替代——金丝雀分析捕获仅在生产规模和流量混合下出现的回归，而非应该被单元或集成测试捕获的问题",
      "不要将金丝雀与单个基线实例比较——与所有稳定实例的聚合进行比较，以考虑实例级方差并避免来自单个实例噪声的误报",
      "不要在非典型流量期间（计划维护窗口、重大事件、流量峰值）运行金丝雀分析，此时基线指标本身就是异常的",
      "不要将烘焙时间设置得太短，使金丝雀没有在足够多的不同时段流量模式下经历足够的流量，从而无法产生统计上有效的比较"
    ],
    "case_study_company": "Netflix",
    "case_study": "Netflix's deployment pipeline uses Kayenta to automatically analyze every production deployment before full rollout. When a Netflix recommendation service deployed a change to its ranking algorithm, Kayenta compared 14 metrics (request latency percentiles, error rates, recommendation click-through rate, stream start failures) between the canary (3% of traffic) and the stable baseline over a 60-minute bake window. The canary scored 94/100 (above the 75/100 promotion threshold), and Spinnaker automatically promoted it to 100% without human intervention. In a separate deployment 2 weeks later, a seemingly minor configuration change caused the P99 latency to increase by 340ms in the canary. Kayenta scored this deployment 23/100, triggering automatic rollback within 8 minutes of the degradation starting — before any users filed complaints and before the on-call team had been paged.",
    "case_study_zh": "Netflix的部署管道使用Kayenta在全量发布前自动分析每次生产部署。当Netflix推荐服务对其排名算法部署变更时，Kayenta在60分钟烘焙窗口内比较了金丝雀（3%流量）与稳定基线之间的14个指标（请求延迟百分位数、错误率、推荐点击率、流媒体启动失败）。金丝雀得分94/100（高于75/100的晋级阈值），Spinnaker自动将其晋级到100%，无需人工干预。两周后的另一次部署中，一个看似微小的配置变更导致金丝雀中P99延迟增加了340ms。Kayenta将该部署评分为23/100，在降级开始后8分钟内触发自动回滚——在任何用户提交投诉之前，也在待命团队被呼叫之前。",
    "when_not_to_use": [
      "Low-traffic services where the canary cannot accumulate enough samples during the bake window to produce statistically valid comparisons — shadow testing or A/B testing with replay traffic may be better alternatives",
      "Services with highly irregular or bursty traffic patterns where baseline metrics have very high natural variance, making it impossible to distinguish canary regressions from normal fluctuation",
      "Deployments with strict data migration dependencies where traffic splitting would cause some requests to hit new code with old data schema and others to hit old code — this requires a coordinated migration strategy, not canary analysis"
    ],
    "when_not_to_use_zh": [
      "低流量服务，金丝雀在烘焙窗口内无法积累足够的样本以产生统计上有效的比较——影子测试或使用回放流量的A/B测试可能是更好的替代方案",
      "流量模式高度不规则或突发的服务，基线指标具有非常高的自然方差，使得无法将金丝雀回归与正常波动区分开",
      "具有严格数据迁移依赖关系的部署，流量分割会导致一些请求使用新代码但旧数据模式，其他请求使用旧代码——这需要协调迁移策略，而非金丝雀分析"
    ],
    "adopters": [
      "Netflix",
      "Google",
      "LinkedIn",
      "Intuit",
      "Pinterest",
      "Waze"
    ],
    "abstraction_level": "system",
    "quality_concerns": [
      "reliability",
      "performance"
    ],
    "maturity_ring": "established",
    "primary_source": "Netflix Tech Blog (2018). \"Automated Canary Analysis at Netflix with Kayenta\". netflixtechblog.com.",
    "secondary_sources": [
      "Google / Netflix (2018). \"Kayenta: An Open Automated Canary Analysis Tool from Google and Netflix\". Spinnaker.io.",
      "Sato, S. (2014). \"Canary Release\". martinfowler.com.",
      "Humble, J. & Farley, D. (2010). \"Continuous Delivery\". Addison-Wesley."
    ],
    "typed_relations": [
      {
        "slug": "service-level-indicators",
        "type": "complement"
      },
      {
        "slug": "feature-flags",
        "type": "related"
      },
      {
        "slug": "blue-green-deployment",
        "type": "related"
      }
    ]
  },
  {
    "id": 311,
    "name": "Service Mesh Observability",
    "name_zh": "服务网格可观测性",
    "slug": "service-mesh-observability",
    "category": "observability",
    "desc": "Leveraging the service mesh data plane (Envoy, Linkerd) to automatically capture golden signal telemetry for every service-to-service call without application code changes",
    "desc_zh": "利用服务网格数据平面（Envoy、Linkerd）自动捕获每个服务间调用的黄金信号遥测数据，无需修改应用代码",
    "steps": [
      "Deploy a service mesh control plane (Istio, Linkerd, or Consul Connect) and inject sidecar proxies into all workloads — the proxies intercept every inbound and outbound connection and emit L4/L7 telemetry automatically",
      "Enable the mesh telemetry pipeline: configure Envoy access logs, Prometheus metrics scraping, and distributed trace propagation (x-b3-traceid headers or W3C Trace Context) to flow into your observability backends",
      "Define service-level traffic policies (retries, timeouts, circuit breaking) in the mesh control plane and correlate these policy events with the telemetry stream to understand how resilience mechanisms affect the four golden signals",
      "Build a topology view by consuming the mesh control plane service registry and combining it with real-time traffic metrics to produce a live service dependency map showing error rates and latency per edge",
      "Set SLO-aligned alerts on mesh-generated metrics (e.g., istio_requests_total, istio_request_duration_milliseconds_bucket) and create runbooks that map mesh observability signals to specific remediation actions"
    ],
    "steps_zh": [
      "部署服务网格控制平面（Istio、Linkerd或Consul Connect）并向所有工作负载注入Sidecar代理——代理拦截所有入站和出站连接，自动发出L4/L7遥测数据",
      "启用网格遥测管道：配置Envoy访问日志、Prometheus指标抓取和分布式追踪传播（x-b3-traceid头或W3C Trace Context）以流入可观测性后端",
      "在网格控制平面定义服务级别流量策略（重试、超时、熔断）并将这些策略事件与遥测流关联，以了解弹性机制如何影响四个黄金信号",
      "通过消费网格控制平面服务注册表并结合实时流量指标，构建拓扑视图，生成显示每条边错误率和延迟的实时服务依赖图",
      "基于网格生成的指标（如istio_requests_total、istio_request_duration_milliseconds_bucket）设置与SLO对齐的告警，并创建将网格可观测性信号映射到具体补救操作的运行手册"
    ],
    "ai_relevant": false,
    "viz_type": "flow",
    "viz_labels": [
      "Deploy Mesh",
      "Telemetry Pipeline",
      "Traffic Policies",
      "Topology View",
      "SLO Alerts"
    ],
    "viz_labels_zh": [
      "部署网格",
      "遥测管道",
      "流量策略",
      "拓扑视图",
      "SLO告警"
    ],
    "related": [
      "opentelemetry",
      "distributed-tracing",
      "four-golden-signals",
      "service-mesh-pattern",
      "slo-as-practice"
    ],
    "tags": [
      "observability",
      "service-mesh",
      "istio",
      "envoy",
      "telemetry",
      "sidecar"
    ],
    "origin_author": "Lyft / Envoy Proxy team; Istio project (Google, IBM, Lyft)",
    "origin_year": 2017,
    "origin_source": "Klein, M. et al. (2016). \"Announcing Envoy: C++ L7 proxy and communication bus\". Lyft Engineering Blog.",
    "origin_source_zh": "Klein, M. 等（2016）。《发布Envoy：C++ L7代理与通信总线》。Lyft工程博客。",
    "complexity": "advanced",
    "abstraction_level": "system",
    "maturity_ring": "established",
    "quality_concerns": [
      "observability",
      "reliability",
      "performance"
    ],
    "adopters": [
      "Lyft",
      "Airbnb",
      "Pinterest",
      "Salesforce",
      "T-Mobile"
    ],
    "when_to_use": [
      "Microservices environments where instrumenting every service individually is operationally impractical or politically difficult",
      "Polyglot architectures where services span multiple languages and a consistent telemetry approach is needed without per-language SDKs",
      "Organizations that need mTLS encryption, traffic shaping, and observability from the same infrastructure layer",
      "Platform teams building internal developer platforms where golden-signal observability must be automatic for all onboarded services"
    ],
    "when_to_use_zh": [
      "单独埋点每个服务在运营上不切实际或政治上困难的微服务环境",
      "服务跨越多种语言且需要统一遥测方式而无需每种语言独立SDK的多语言架构",
      "需要从同一基础设施层获得mTLS加密、流量整形和可观测性的组织",
      "构建内部开发者平台的平台团队，要求所有入驻服务自动具备黄金信号可观测性"
    ],
    "core_concepts": [
      "Sidecar Proxy: A co-located proxy container (typically Envoy) intercepts all network traffic to and from the application container, enabling zero-code-change observability, security, and traffic management",
      "Golden Signals from L7: The mesh proxy automatically tracks request rate, error rate, and latency (RED method) per service pair, plus saturation via connection pool metrics, without any application-side code",
      "Control Plane Telemetry API: The mesh control plane (Istiod, Linkerd control plane) provides a service topology graph and policy-enforcement telemetry that complements the data-plane metrics",
      "mTLS-correlated Tracing: Service mesh can correlate distributed traces with mTLS certificate identities, enabling security audits and per-identity traffic analysis alongside performance observability",
      "Traffic Policy Events: Retries, circuit-breaker state transitions, and outlier ejections emitted by the data plane form an event stream that enriches the observability picture beyond raw request metrics"
    ],
    "core_concepts_zh": [
      "Sidecar代理：共同部署的代理容器（通常是Envoy）拦截应用容器所有进出网络流量，实现零代码变更的可观测性、安全性和流量管理",
      "L7黄金信号：网格代理自动追踪每对服务的请求率、错误率和延迟（RED方法），以及通过连接池指标追踪饱和度，无需应用侧任何代码",
      "控制平面遥测API：网格控制平面（Istiod、Linkerd控制平面）提供服务拓扑图和策略执行遥测，补充数据平面指标",
      "mTLS关联追踪：服务网格可将分布式追踪与mTLS证书身份关联，在性能可观测性之外实现安全审计和按身份的流量分析",
      "流量策略事件：数据平面发出的重试、熔断状态转换和异常弹出形成事件流，在原始请求指标之外丰富可观测性视图"
    ],
    "timeline": [
      [
        2016,
        "Lyft open-sources Envoy proxy; its per-request observability hooks become the foundation of mesh telemetry"
      ],
      [
        2017,
        "Istio 0.1 launched by Google, IBM, and Lyft with built-in Prometheus, Grafana, and Zipkin integrations"
      ],
      [
        2018,
        "Linkerd 2.0 (Buoyant) introduces a Rust-based micro-proxy focused on ultra-low-overhead observability for Kubernetes"
      ],
      [
        2022,
        "Istio joins CNCF; eBPF-based sidecar-less mesh approaches (Cilium, Ambient Mesh) emerge as next-generation observability layer"
      ]
    ],
    "timeline_zh": [
      [
        2016,
        "Lyft开源Envoy代理；其每请求可观测性钩子成为网格遥测的基础"
      ],
      [
        2017,
        "Istio 0.1由Google、IBM和Lyft发布，内置Prometheus、Grafana和Zipkin集成"
      ],
      [
        2018,
        "Linkerd 2.0（Buoyant）引入基于Rust的微代理，专注于Kubernetes的超低开销可观测性"
      ],
      [
        2022,
        "Istio加入CNCF；基于eBPF的无Sidecar网格方案（Cilium、Ambient Mesh）作为下一代可观测性层涌现"
      ]
    ],
    "dos": [
      "Do enable distributed trace context propagation at the mesh level and ensure applications pass through trace headers unchanged, so end-to-end traces span both mesh-generated and application-generated spans",
      "Do use mesh telemetry as the floor, not the ceiling — add application-level spans for business logic details that the proxy cannot observe",
      "Do monitor control plane health separately; a degraded Istiod can cause stale configuration that skews traffic policy metrics",
      "Do implement workload-level RBAC in the mesh and align it with your observability access controls so security and telemetry data share the same identity model"
    ],
    "dos_zh": [
      "在网格级别启用分布式追踪上下文传播，并确保应用原样传递追踪头，以便端到端追踪跨越网格生成的和应用生成的Span",
      "将网格遥测视为基础而非上限——为代理无法观测的业务逻辑细节添加应用级Span",
      "单独监控控制平面健康状况；降级的Istiod可能导致配置过时从而使流量策略指标失真",
      "在网格中实施工作负载级RBAC并与可观测性访问控制对齐，使安全和遥测数据共享同一身份模型"
    ],
    "donts": [
      "Don't assume sidecar observability replaces application-level tracing — the proxy only sees network-level events; business transaction context requires application instrumentation",
      "Don't underestimate sidecar resource overhead: each Envoy sidecar consumes 50-100 MB RAM and adds 1-5 ms latency; benchmark before fleet-wide rollout",
      "Don't ignore cardinality explosions in mesh-generated labels — high-cardinality dimensions (user IDs, request paths) on Prometheus metrics can exhaust memory",
      "Don't route all telemetry through the mesh itself — observability data should bypass the mesh's own retry/timeout policies to avoid circular failure modes"
    ],
    "donts_zh": [
      "不要认为Sidecar可观测性替代了应用级追踪——代理只能看到网络级事件；业务事务上下文需要应用埋点",
      "不要低估Sidecar资源开销：每个Envoy Sidecar消耗50-100 MB内存并增加1-5 ms延迟；在全量部署前进行基准测试",
      "不要忽视网格生成标签的基数爆炸——Prometheus指标上的高基数维度（用户ID、请求路径）可能耗尽内存",
      "不要将所有遥测数据路由通过网格本身——可观测性数据应绕过网格自身的重试/超时策略，以避免循环故障模式"
    ],
    "case_study_company": "Airbnb",
    "case_study": "Airbnb adopted Envoy-based service mesh observability as part of their OneWeb platform initiative. By replacing hundreds of bespoke service-to-service monitoring integrations with mesh-generated RED metrics and distributed traces, they reduced the time to diagnose cross-service latency regressions from hours to minutes. Their platform team exposed a self-service Grafana dashboard template backed entirely by Istio metrics, enabling product teams to achieve production-grade observability on day one of service creation without any instrumentation code.",
    "case_study_zh": "Airbnb在其「OneWeb」平台计划中采用了基于Envoy的服务网格可观测性。通过用网格生成的RED指标和分布式追踪替代数百个定制的服务间监控集成，他们将诊断跨服务延迟回归的时间从数小时缩短至数分钟。他们的平台团队提供了完全基于Istio指标的自助式Grafana仪表盘模板，使产品团队在服务创建第一天就能实现生产级可观测性，无需任何埋点代码。",
    "when_not_to_use": [
      "Monolithic applications where all service calls are in-process — the mesh only observes network traffic, not in-process function calls",
      "Teams without Kubernetes expertise — service meshes introduce significant operational complexity that can outweigh observability benefits for small teams",
      "Latency-critical paths where the extra 1-5 ms of sidecar overhead is unacceptable (high-frequency trading, real-time gaming)",
      "Early-stage startups with fewer than 10 services where the setup cost of a mesh exceeds the debugging value it provides"
    ],
    "when_not_to_use_zh": [
      "所有服务调用都在进程内的单体应用——网格只能观测网络流量，不能观测进程内函数调用",
      "缺乏Kubernetes专业知识的团队——服务网格引入了显著的运营复杂性，对小团队而言可能超过可观测性收益",
      "延迟关键路径中额外的1-5 ms Sidecar开销不可接受的场景（高频交易、实时游戏）",
      "服务数量少于10个的早期创业公司，网格搭建成本超过其提供的调试价值"
    ],
    "primary_source": "Klein, M. et al. (2017). \"Envoy Proxy\". Lyft Engineering. envoyproxy.io",
    "primary_source_zh": "Klein, M. 等（2017）。《Envoy代理》。Lyft工程。envoyproxy.io",
    "secondary_sources": [
      "Morgan, W. (2017). \"What's a service mesh? And why do I need one?\". Buoyant Engineering Blog.",
      "Istio Authors (2017-2024). \"Istio Documentation: Observability\". istio.io/docs/concepts/observability"
    ],
    "secondary_sources_zh": [
      "Morgan, W.（2017）。《什么是服务网格？我为什么需要它？》。Buoyant工程博客。",
      "Istio作者（2017-2024）。《Istio文档：可观测性》。istio.io/docs/concepts/observability"
    ]
  }
]
