{
  "evaluation_id": "EVAL-20260402-135144",
  "question_id": "CODE-018",
  "question_text": "Implement a minimal but correct event sourcing system in Python. Include: an Event base class, an EventStore (in-memory is fine), an Aggregate base class that rebuilds state from events, a BankAccount aggregate with deposit/withdraw/transfer commands, and snapshot support for performance. Demonstrate eventual consistency between two projections.",
  "category": "code",
  "timestamp": "2026-04-02T13:51:44.778Z",
  "display_date": "Apr 02, 2026",
  "winner": {
    "name": "Grok 4.20",
    "provider": "openrouter",
    "score": 7.72
  },
  "avg_score": 6.26,
  "matrix_size": 90,
  "models_used": [
    {
      "id": "gemini_3_flash",
      "name": "Gemini 3 Flash Preview",
      "provider": "Google"
    },
    {
      "id": "claude_opus_46",
      "name": "Claude Opus 4.6",
      "provider": "openrouter"
    },
    {
      "id": "gemini_31_pro",
      "name": "Gemini 3.1 Pro",
      "provider": "openrouter"
    },
    {
      "id": "claude_sonnet_46",
      "name": "Claude Sonnet 4.6",
      "provider": "openrouter"
    },
    {
      "id": "grok_420",
      "name": "Grok 4.20",
      "provider": "openrouter"
    },
    {
      "id": "deepseek_v4",
      "name": "DeepSeek V4",
      "provider": "openrouter"
    },
    {
      "id": "gpt_oss_120b",
      "name": "GPT-OSS-120B",
      "provider": "OpenAI"
    },
    {
      "id": "minimax_m25",
      "name": "MiniMax M2.5",
      "provider": "openrouter"
    },
    {
      "id": "mimo_v2_flash",
      "name": "MiMo-V2-Flash",
      "provider": "Xiaomi"
    },
    {
      "id": "gpt_5_4",
      "name": "GPT-5.4",
      "provider": "openrouter"
    }
  ],
  "rankings": {
    "grok_420": {
      "display_name": "Grok 4.20",
      "provider": "openrouter",
      "average_score": 7.72,
      "score_count": 9,
      "min_score": 6.4,
      "max_score": 9.45,
      "rank": 1
    },
    "gemini_3_flash": {
      "display_name": "Gemini 3 Flash Preview",
      "provider": "Google",
      "average_score": 7.41,
      "score_count": 9,
      "min_score": 6,
      "max_score": 9.6,
      "rank": 2
    },
    "gpt_oss_120b": {
      "display_name": "GPT-OSS-120B",
      "provider": "OpenAI",
      "average_score": 7.33,
      "score_count": 9,
      "min_score": 3.5,
      "max_score": 9.45,
      "rank": 3
    },
    "claude_opus_46": {
      "display_name": "Claude Opus 4.6",
      "provider": "openrouter",
      "average_score": 7.22,
      "score_count": 9,
      "min_score": 3.65,
      "max_score": 9.6,
      "rank": 4
    },
    "gpt_5_4": {
      "display_name": "GPT-5.4",
      "provider": "openrouter",
      "average_score": 6.84,
      "score_count": 9,
      "min_score": 4.6,
      "max_score": 8.6,
      "rank": 5
    },
    "claude_sonnet_46": {
      "display_name": "Claude Sonnet 4.6",
      "provider": "openrouter",
      "average_score": 6.75,
      "score_count": 9,
      "min_score": 3.25,
      "max_score": 9.6,
      "rank": 6
    },
    "deepseek_v4": {
      "display_name": "DeepSeek V4",
      "provider": "openrouter",
      "average_score": 6.19,
      "score_count": 9,
      "min_score": 4.2,
      "max_score": 8.55,
      "rank": 7
    },
    "mimo_v2_flash": {
      "display_name": "MiMo-V2-Flash",
      "provider": "Xiaomi",
      "average_score": 6.13,
      "score_count": 9,
      "min_score": 2.05,
      "max_score": 9.6,
      "rank": 8
    },
    "minimax_m25": {
      "display_name": "MiniMax M2.5",
      "provider": "openrouter",
      "average_score": 3.76,
      "score_count": 9,
      "min_score": 1.65,
      "max_score": 7.8,
      "rank": 9
    },
    "gemini_31_pro": {
      "display_name": "Gemini 3.1 Pro",
      "provider": "openrouter",
      "average_score": 3.25,
      "score_count": 9,
      "min_score": 0.85,
      "max_score": 8.8,
      "rank": 10
    }
  },
  "judgments": [
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 7.45,
      "brief_justification": "The answer provides a functional minimal event sourcing system with clear code and explanations, but it omits the required transfer command, reducing completeness and overall usefulness."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 3.65,
      "brief_justification": "The response starts with a solid event and store design, including optimistic concurrency and snapshots, but it is clearly truncated before completing subscribe(), Aggregate base class, BankAccount commands, transfer logic, snapshot usage, and the eventual consistency projection demo. What is present is fairly clear, but the requested implementation is incomplete and therefore of limited practical use."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 0.85,
      "brief_justification": "The response claims to provide a complete implementation but includes no actual code or substantive content. It is clear in intent but fails to answer the question, making it largely incorrect, incomplete, and not useful."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 3.25,
      "brief_justification": "The response is truncated and does not implement the full requested system. What is shown is reasonably clear and mostly sensible, but key required pieces are missing: Aggregate snapshot methods are unfinished, no BankAccount commands/logic, no transfer workflow, no rebuild/save flow, and no demonstration of eventual consistency between projections."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 6.4,
      "brief_justification": "Well-structured and mostly plausible, but it does not fully meet the prompt: transfer is incomplete because no corresponding credit event is emitted/applied to the destination account, snapshot support is ad hoc rather than integrated into aggregate rehydration, and eventual consistency between two projections is not actually demonstrated. Core event store, aggregate replay, and basic projections are present and clearly explained."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 4.2,
      "brief_justification": "Covers most requested components, but the implementation is not minimally correct: dataclass event subclasses are incompatible with the Event base __init__, version handling is broken because apply_change does not increment version, snapshot restoration replays from a fresh aggregate instead of snapshot state, projection processing replays all events repeatedly, and the demo is truncated and does not properly demonstrate eventual consistency between two projections."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 3.5,
      "brief_justification": "The response is truncated mid-code and does not deliver a runnable minimal system. It outlines relevant components and some event/store logic correctly, but key required parts are missing or incomplete, including the aggregate base, full BankAccount commands, snapshot usage, projections, and the eventual consistency demonstration."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 6.15,
      "brief_justification": "Well-structured and mostly clear, but it misses the requested transfer command and the eventual consistency demo is only synchronous pub-sub rather than a true delayed projection example. There is also a significant correctness issue: save_snapshot uses asdict(aggregate) on a non-dataclass aggregate, which will fail. Concurrency/version checks are also absent, though acceptable for a minimal design."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "minimax_m25",
      "respondent_name": "MiniMax M2.5",
      "weighted_score": 1.85,
      "brief_justification": "The response is truncated and does not deliver the requested implementation. It only begins an Event base class and mentions intended components, but omits EventStore, Aggregate rebuilding, BankAccount commands, snapshot support, and the eventual consistency projection demo."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 2.05,
      "brief_justification": "The response is truncated and does not provide a runnable implementation. Core requested parts are missing or incomplete, including projections/eventual consistency demo. Snapshot handling is also flawed because snapshot events are appended to the event stream and can break optimistic concurrency/version semantics."
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 6.85,
      "brief_justification": "The implementation covers most of the requested components well: Event base class, EventStore, Aggregate base class, BankAccount aggregate with deposit/withdraw/transfer, snapshot support, and repository with concurrency control. The code is well-structured and clearly organized. However, the response is truncated - the BalanceProjection's handle method is cut off, and the second projection and the demonstration of eventual consistency between two projections are missing entirely. This is a sign"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 0.85,
      "brief_justification": "The response promises a complete implementation but provides absolutely no code. It only contains an introductory description and a heading ('The Implementation') with nothing following it. No Event base class, no EventStore, no Aggregate, no BankAccount, no snapshot support, no projections, and no demonstration of eventual consistency. The response is essentially empty and fails to address any part of the question."
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 6.25,
      "brief_justification": "The response demonstrates strong architectural understanding and clear documentation, with well-designed Event, EventStore, Aggregate base classes, snapshot support, and BankAccount events. However, the code is truncated mid-implementation - the Aggregate base class is incomplete (to_snapshot_state is cut off), the BankAccount aggregate with deposit/withdraw/transfer commands is missing, snapshot restoration logic is incomplete, and the demonstration of eventual consistency between two projectio"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 6.85,
      "brief_justification": "The implementation covers most requested components: Event base class, in-memory EventStore, Aggregate base class, BankAccount aggregate with commands, snapshot support, and two projections. However, there are notable issues: (1) The code is incomplete - it cuts off without the demonstration of eventual consistency between projections, which was explicitly requested. (2) The transfer command only records TransferSent on the source account but never creates TransferReceived on the destination - t"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 5.4,
      "brief_justification": "The response is cut off mid-code, so the demonstration is incomplete. There are several correctness issues: (1) Using @dataclass with Event ABC creates __init__ conflicts - the dataclass-generated __init__ won't call Event.__init__ properly, so event_id/timestamp won't be set. (2) The snapshot restoration in get_aggregate is buggy - it replays only new events but creates a fresh aggregate instead of using the snapshot copy. (3) process_new_events replays ALL events every time rather than trackin"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 5.85,
      "brief_justification": "The response is well-structured and clearly written, with good architectural design for an event sourcing system. However, the code is truncated/incomplete - it cuts off mid-method in the EventStore class, missing the snapshot loading/saving methods, the AggregateRoot base class, the BankAccount aggregate implementation, the projection classes, and the demonstration of eventual consistency. Since these are core requirements of the question, the response fails to deliver a complete, runnable solu"
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "minimax_m25",
      "respondent_name": "MiniMax M2.5",
      "weighted_score": 3.05,
      "brief_justification": "The response is incomplete - it cuts off mid-implementation with only the Event base class partially defined. The EventStore, Aggregate base class, BankAccount aggregate, snapshot support, and projection demonstrations are all missing. What exists is structurally reasonable but the response fails to deliver on the core requirements of the question."
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 6.55,
      "brief_justification": "The implementation covers most requested features but has notable correctness issues: (1) `asdict(aggregate)` won't work on BankAccount since it's not a dataclass, breaking snapshot save/load; (2) Transfer command between accounts is missing entirely despite being explicitly requested; (3) The 'eventual consistency' demonstration is actually synchronous - projections are updated immediately in the same call, so it doesn't truly demonstrate eventual consistency or any lag between projections. The"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "minimax_m25",
      "respondent_name": "MiniMax M2.5",
      "weighted_score": 1.85,
      "brief_justification": "The response is severely incomplete - the code is cut off very early in the implementation. It only shows a partial Event base class and nothing else. The EventStore, Aggregate base class, BankAccount aggregate, snapshot support, and eventual consistency demonstration are all missing. While the outline and what little code exists shows reasonable structure and understanding, the response fails to deliver on virtually all requirements of the question."
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 5.25,
      "brief_justification": "The response is cut off mid-implementation - the Projection class and the demonstration of eventual consistency between two projections are missing. The core infrastructure (Event, EventStore, Aggregate) is reasonably correct, and the BankAccount aggregate has proper commands and apply methods. However, there are design issues: storing the computed balance in events rather than deriving it purely from apply handlers is questionable; the snapshot implementation has a bug where it saves a snapshot"
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 4.6,
      "brief_justification": "The response provides a solid foundation for an event sourcing system with good DDD patterns, but it cuts off abruptly, leaving the code syntactically invalid and missing the required projections and eventual consistency demonstration."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 5.9,
      "brief_justification": "The response cuts off abruptly, missing the Aggregate base class, BankAccount aggregate, projections, and eventual consistency demonstration. The provided code is correct and clear, but highly incomplete."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 6,
      "brief_justification": "The code provided is highly accurate, well-structured, and demonstrates a solid understanding of event sourcing primitives. However, the response is severely truncated and completely misses the requested BankAccount aggregate, commands, projections, and demonstration of eventual consistency."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 6.65,
      "brief_justification": ""
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 5.55,
      "brief_justification": ""
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 5.45,
      "brief_justification": "The code provided is well-structured and accurate, but the response is cut off prematurely, missing the aggregate classes, commands, and projections requested."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 6,
      "brief_justification": ""
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "minimax_m25",
      "respondent_name": "MiniMax M2.5",
      "weighted_score": 1.65,
      "brief_justification": "The response is severely truncated, providing only the imports and a partial Event base class. It fails to implement the EventStore, Aggregate, BankAccount, snapshots, and projections requested."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 5.15,
      "brief_justification": "The response provides a well-structured foundation for event sourcing but is severely truncated, completely missing the projections and eventual consistency demonstration."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 7.2,
      "brief_justification": "The implementation is well-structured and demonstrates solid event sourcing concepts including events, aggregate, repository, snapshot support, and projections. The code is clean and readable with good separation of concerns. However, the response is truncated - the BalanceProjection class is cut off mid-definition, and the demonstration of eventual consistency between two projections is missing entirely. The transfer mechanism is well thought out with idempotency via transfer_id. The snapshot_e"
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 8.8,
      "brief_justification": "The implementation is technically sound with proper immutable events using frozen dataclasses, optimistic concurrency control, thread safety with locks, snapshot support, and global event log for projections. The code is well-structured and documented. Minor deductions: the response appears truncated (subscribe method incomplete, BankAccount aggregate and projection demonstration not shown), and eventual consistency demonstration between two projections is missing from the visible portion. The d"
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 1.85,
      "brief_justification": "The response only contains a description/outline of what will be implemented but provides no actual code. The answer promises to show an implementation but delivers nothing concrete - no Event class, no EventStore, no Aggregate, no BankAccount, no snapshots, no projections. This is essentially an empty response that fails to answer the question."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 8,
      "brief_justification": "The implementation is largely correct and well-structured with clear separation of concerns. It covers all required components: Event base class, EventStore with snapshot support, Aggregate base class with event replay, BankAccount with commands, and two projections. Minor issues: the version=0 passed in event constructors is immediately overwritten by _record(), which works but is slightly confusing; the 'eventual consistency' demonstration between projections is described but not actually show"
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 5.8,
      "brief_justification": "The implementation has several issues: the @dataclass decorator conflicts with the custom __init__ in Event base class (dataclass events won't call super().__init__), the snapshot restoration logic is flawed (it creates a new aggregate from only the post-snapshot events but doesn't properly restore from the snapshot object), process_new_events re-processes all events on every call causing duplicate projection updates, the transfer method has a bug where it saves the receiving account inside the "
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 8,
      "brief_justification": "The response is well-structured and demonstrates solid event sourcing concepts including Event base class, EventStore with snapshots, and AggregateRoot. The code is clean and readable. However, the response appears to be cut off before showing the complete implementation (BankAccount aggregate, projections, and eventual consistency demonstration are missing from the visible output), which significantly impacts completeness. The parts shown are correct and well-explained with good depth on concur"
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 7.75,
      "brief_justification": "The implementation is well-structured and covers most requirements: Event base class, EventStore, Aggregate with event replay, BankAccount with deposit/withdraw, snapshots, and projections. However, there are notable issues: (1) transfer between accounts is missing despite being explicitly requested, (2) the snapshot restoration is flawed - `asdict(aggregate)` won't work correctly since Aggregate isn't a dataclass, and restoring via setattr would overwrite balance with 0 from the snapshot data s"
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 7,
      "brief_justification": "The implementation is well-structured and mostly correct, covering Event, EventStore, Aggregate, BankAccount, and snapshot support. However, the response is cut off before completing the Projection class and the demonstration of eventual consistency, which were explicitly required. The snapshot-as-event approach has a concurrency version tracking bug (saving snapshot event after marking changes committed creates version mismatch). The transfer command only debits without crediting the target acc"
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 6.4,
      "brief_justification": "The code implements a solid core of an event-sourced system with correct patterns for aggregates, event store, snapshots, and projections. However, the response is incomplete: the BalanceProjection.handle method is cut off mid-definition, and the final demonstration of eventual consistency between two projections is entirely missing."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 8.4,
      "brief_justification": "Implementation is mostly correct and complete with good structure, optimistic concurrency, snapshots, and projections. Minor issues: current_version calculation is off-by-one, from_dict doesn't handle subclasses properly, enrich_event has serialization quirks, and demo code is truncated."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 3.25,
      "brief_justification": "The response claims to provide a full implementation but contains no actual code. It only describes what it will include without delivering any classes, methods, or demonstration. This fails to implement the requested system."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 7.9,
      "brief_justification": "The implementation is mostly correct and well-structured with good explanations. It includes all requested components except the full BankAccount aggregate (missing command methods, state, apply handlers, and snapshot implementation) and the two projections for eventual consistency demo. The code is clear, demonstrates deep ES knowledge, but is incomplete as a working minimal system."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 6,
      "brief_justification": "Implements most requested components but has several bugs (incomplete demo, snapshot logic errors, transfer handling issues, missing event_id/aggregate_id in some events, incorrect replay in get_aggregate). Shows basic structure and eventual consistency concept but not fully correct or complete."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 8.6,
      "brief_justification": "The implementation is factually correct, logically sound, and covers all requested components (Event base, EventStore with snapshots, Aggregate base, BankAccount with deposit/withdraw/transfer, projections for eventual consistency). Code is clear, well-structured, and highly practical. Minor deductions for incomplete response (code appears truncated) and lack of full Aggregate base + BankAccount class definitions in the provided snippet, though the overall design is excellent."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 6.6,
      "brief_justification": "The implementation covers most requested components but has several bugs: incorrect event instantiation in apply_new (missing timestamp), broken snapshot restoration (asdict on Aggregate doesn't capture state properly), version handling issues, and missing transfer command. Projections demonstrate eventual consistency via subscriptions. Code is mostly clear but contains technical flaws that prevent it from being fully correct."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "minimax_m25",
      "respondent_name": "MiniMax M2.5",
      "weighted_score": 3.6,
      "brief_justification": "The response is incomplete: it only implements the base Event class and then cuts off abruptly. None of the required components (EventStore, Aggregate base, BankAccount with commands, snapshots, or projections demonstrating eventual consistency) are present. The partial code shown has some basic structure but is not functional or correct for the full request."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 6.2,
      "brief_justification": "Implementation has several bugs (version handling, snapshot as event, incomplete Projection class, missing eventual consistency demo) but shows good structure and covers most requested components."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.6,
      "brief_justification": "Well-structured implementation with good coverage of event sourcing concepts, though missing the demonstration of eventual consistency between projections."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 9.6,
      "brief_justification": "Comprehensive implementation covering all requested features with clear structure and practical utility."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 8.05,
      "brief_justification": "Accurate and well-structured with practical value, though depth could be slightly deeper."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 8.6,
      "brief_justification": "Comprehensive and well-structured implementation with good depth and practical value."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 9.45,
      "brief_justification": "Comprehensive and accurate implementation with all requested features."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 9.45,
      "brief_justification": "Comprehensive and accurate implementation with clear structure and practical value."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 9.6,
      "brief_justification": "Comprehensive and accurate implementation with clear explanations and practical demonstration."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "minimax_m25",
      "respondent_name": "MiniMax M2.5",
      "weighted_score": 7.8,
      "brief_justification": "Clear and accurate with good depth."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 9.6,
      "brief_justification": "Comprehensive and accurate implementation with clear structure and practical value."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 4.8,
      "brief_justification": "Code is well-structured and shows key concepts, but it is incomplete and cannot be executed as provided, limiting its correctness and practical usefulness."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 4.1,
      "brief_justification": "The answer includes a solid Event base class and an in-memory EventStore with snapshot handling, but it stops short of providing an Aggregate base class, the BankAccount aggregate with its commands, or a demonstration of eventual consistency between projections, making it incomplete and of limited practical utility."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 2.35,
      "brief_justification": "Response lacks the actual implementation code, making it incomplete and of limited usefulness despite a clear description."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 4.95,
      "brief_justification": "The code correctly defines core primitives (Event, EventStore, Snapshot, part of Aggregate) with clear structure, but it stops before implementing the BankAccount aggregate, command methods, snapshot handling, and projection demonstration, making it incomplete and of limited practical use."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 7.15,
      "brief_justification": ""
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 4.9,
      "brief_justification": "The answer includes most required components and is well-structured, but the implementation has critical bugs (dataclass event constructors, incomplete demo, snapshot not demonstrated) that prevent it from working correctly, limiting its practical usefulness."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "minimax_m25",
      "respondent_name": "MiniMax M2.5",
      "weighted_score": 3.2,
      "brief_justification": "The answer starts with a clear description and a partial Event class, but it stops short of providing the required EventStore, Aggregate, BankAccount logic, snapshot handling, and projection demonstration, making it incomplete and of limited practical value."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 5.55,
      "brief_justification": "Provides core event sourcing components and a BankAccount aggregate with snapshot handling, but the projection implementation is cut off and no demonstration of eventual consistency is shown. Transfer logic is simplistic and the code is not fully runnable, limiting correctness and completeness despite decent clarity and moderate depth."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.45,
      "brief_justification": "The implementation is technically sound, featuring a robust EventStore with concurrency checks and a well-structured Aggregate base. It includes snapshot logic and the requested BankAccount logic. However, the code is truncated at the very end, missing the completion of the BalanceProjection class and the demonstration script."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 9.6,
      "brief_justification": "The implementation is exceptionally high quality. It correctly handles optimistic concurrency, snapshotting, and eventual consistency via a background projection thread. The code is idiomatic, well-documented, and provides a complete working example of the requested domain logic."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 2.25,
      "brief_justification": "The model provided an introductory description but failed to include any actual code implementation, which was the primary requirement of the prompt."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 9.6,
      "brief_justification": "The implementation is exceptionally high quality. It correctly implements optimistic concurrency, snapshotting logic, and the event replay pattern. The code is idiomatic, well-documented, and addresses all parts of the prompt including eventual consistency via a subscription model."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 9,
      "brief_justification": "The implementation is robust and follows DDD/Event Sourcing patterns closely. It includes snapshotting and projections as requested. While the 'eventual consistency' demonstration is implied through the separate projection classes rather than an asynchronous bus, the architectural separation is clear."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 8.55,
      "brief_justification": "The implementation is solid and covers all requirements. It correctly implements the Aggregate pattern, Event Store, and Snapshotting. The demonstration of eventual consistency via projections is clear. Note: The code snippet ends abruptly at the very end, but the logic is fully present."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 8.6,
      "brief_justification": "The implementation is robust and follows DDD/Event Sourcing patterns correctly. It includes versioning for concurrency, a clean separation of concerns, and a working demonstration of eventual consistency. It was slightly cut off at the very end of the code block, but the core logic is fully present."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "minimax_m25",
      "respondent_name": "MiniMax M2.5",
      "weighted_score": 5.05,
      "brief_justification": "The response is incomplete. It provides the initial setup and part of the Event class but cuts off before implementing the EventStore, Aggregate, BankAccount, Snapshots, or Projections requested in the prompt."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 8.6,
      "brief_justification": "The implementation is technically sound and follows event sourcing patterns correctly. It includes optimistic concurrency, snapshotting, and a clear separation of concerns. The response was cut off at the very end (Projection class), but the core logic for the requested features is present and high quality."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 6.1,
      "brief_justification": "Code is mostly correct and shows good understanding of event sourcing concepts, but the response is incomplete - cuts off mid-class in the Projection section and lacks the requested demonstration of eventual consistency between two projections."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 6.15,
      "brief_justification": "The code shown is well-structured and correct, but the response is truncated - missing the Aggregate base class, BankAccount aggregate implementation, and eventual consistency demonstration. The partial implementation shows good depth with optimistic concurrency, snapshots, and thread safety, but is incomplete."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 1,
      "brief_justification": "Response only contains an introductory paragraph and a heading promising implementation, but no actual code is provided. The response is essentially empty."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 5.6,
      "brief_justification": "Well-structured event sourcing infrastructure with proper concurrency control and snapshots, but response is truncated mid-implementation - missing BankAccount aggregate, command handlers, and the critical demonstration of eventual consistency between projections."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 7.6,
      "brief_justification": "Well-structured implementation with correct core patterns. Minor issues: polymorphic event deserialization is incomplete, transfer command doesn't create corresponding TransferReceived event, and lacks a working demo of eventual consistency between projections."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 7.35,
      "brief_justification": "Complete implementation with Event, Aggregate, EventStore, BankAccount, and snapshots. However, has correctness issues: FundsTransferred only debits sender but doesn't credit recipient in the projection, process_new_events reprocesses all events instead of new ones, snapshot restoration has bugs, and demo code is truncated. Good clarity and depth overall."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 7.9,
      "brief_justification": "Well-structured, correct implementation of core event sourcing patterns with proper type hints, threading, and serialization. However, the response appears truncated - it ends mid-sentence with 'Convenient' and lacks the demonstration of eventual consistency between the two projections that was requested."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 7.55,
      "brief_justification": "Well-structured implementation with clear separation of concerns. Includes Event base class, EventStore, Aggregate with state rebuild, BankAccount with deposit/withdraw, and snapshot support. Minor issues: transfer command is missing (only deposit/withdraw provided), and there are some implementation quirks like dataclass_replace defined after use. Demonstrates eventual consistency well through projections."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 5.75,
      "brief_justification": "Well-structured code with correct implementation of core ES components (Event, EventStore, Aggregate, BankAccount, snapshots). However, response is incomplete - ends abruptly mid-code at Projection class, missing the demonstration of eventual consistency between projections promised in the introduction."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.6,
      "brief_justification": "Comprehensive implementation with correct event sourcing patterns, snapshot support, and projections. Minor incompleteness in projection demonstration."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 8.8,
      "brief_justification": "Comprehensive implementation covering all requested components with clear structure and practical examples, though snapshot integration could be more explicit."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 8.8,
      "brief_justification": "The response provides a complete, working Python implementation covering all requested components (Event, Aggregate, EventStore, BankAccount, Snapshots, and Projections). It correctly demonstrates eventual consistency using threading and an event bus. The code is clean, well-structured, and highly actionable."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 8.6,
      "brief_justification": "Comprehensive implementation with correct event sourcing patterns, clear structure, and practical features like snapshots and projections. Minor omission in the provided code snippet (Aggregate class incomplete)."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 8.35,
      "brief_justification": "Solid implementation with correct core concepts. Missing the actual demonstration of eventual consistency between projections (no event bus/dispatcher shown)."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 8,
      "brief_justification": "Solid implementation with correct patterns, though transfer logic has minor inconsistency and demo is slightly incomplete."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 8.6,
      "brief_justification": "Strong implementation of core concepts with good structure. Missing a critical concurrency check in the aggregate's event application logic, but otherwise robust and demonstrative."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 9,
      "brief_justification": "Excellent implementation covering all requested components. The code is clean, correct, and demonstrates the concepts well. Minor deduction in depth for not explicitly handling the 'transfer' command as requested, though the logic is easily derivable."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "minimax_m25",
      "respondent_name": "MiniMax M2.5",
      "weighted_score": 5.8,
      "brief_justification": "Implementation is incomplete and contains logical errors (e.g., Event base class lacks proper type handling, snapshot support is missing, eventual consistency demonstration is absent)."
    }
  ],
  "meta": {
    "source": "The Multivac (app.themultivac.com)",
    "methodology": "10x10 blind peer matrix evaluation",
    "criteria": "correctness, completeness, clarity, depth, usefulness",
    "self_judgments": "excluded from rankings",
    "license": "Open data — cite as: The Multivac (2026)"
  }
}