{
  "evaluation_id": "EVAL-20260402-132144",
  "question_id": "CODE-014",
  "question_text": "Implement a Last-Writer-Wins Element Set (LWW-Element-Set) CRDT in Python. It should support add, remove, lookup, and merge operations. Include a proof that merge is commutative, associative, and idempotent. Write tests demonstrating conflict resolution between two divergent replicas.",
  "category": "code",
  "timestamp": "2026-04-02T13:21:44.320Z",
  "display_date": "Apr 02, 2026",
  "winner": null,
  "avg_score": 7.68,
  "matrix_size": 38,
  "models_used": [
    {
      "id": "deepseek_v4",
      "name": "DeepSeek V4",
      "provider": "openrouter"
    },
    {
      "id": "gpt_oss_120b",
      "name": "GPT-OSS-120B",
      "provider": "OpenAI"
    },
    {
      "id": "gemini_3_flash",
      "name": "Gemini 3 Flash Preview",
      "provider": "Google"
    },
    {
      "id": "claude_sonnet_46",
      "name": "Claude Sonnet 4.6",
      "provider": "openrouter"
    },
    {
      "id": "mimo_v2_flash",
      "name": "MiMo-V2-Flash",
      "provider": "Xiaomi"
    },
    {
      "id": "gpt_5_4",
      "name": "GPT-5.4",
      "provider": "openrouter"
    },
    {
      "id": "claude_opus_46",
      "name": "Claude Opus 4.6",
      "provider": "openrouter"
    },
    {
      "id": "gemini_31_pro",
      "name": "Gemini 3.1 Pro",
      "provider": "openrouter"
    }
  ],
  "rankings": {
    "gemini_3_flash": {
      "display_name": "Gemini 3 Flash Preview",
      "provider": "Google",
      "average_score": 8.98,
      "score_count": 6,
      "min_score": 8.45,
      "max_score": 9.8,
      "rank": 1
    },
    "gpt_5_4": {
      "display_name": "GPT-5.4",
      "provider": "openrouter",
      "average_score": 8.75,
      "score_count": 5,
      "min_score": 8.25,
      "max_score": 9.45,
      "rank": 2
    },
    "deepseek_v4": {
      "display_name": "DeepSeek V4",
      "provider": "openrouter",
      "average_score": 8.49,
      "score_count": 4,
      "min_score": 7.4,
      "max_score": 8.95,
      "rank": 3
    },
    "gpt_oss_120b": {
      "display_name": "GPT-OSS-120B",
      "provider": "OpenAI",
      "average_score": 8.17,
      "score_count": 5,
      "min_score": 6.75,
      "max_score": 9.6,
      "rank": 4
    },
    "mimo_v2_flash": {
      "display_name": "MiMo-V2-Flash",
      "provider": "Xiaomi",
      "average_score": 8,
      "score_count": 5,
      "min_score": 7.65,
      "max_score": 8.8,
      "rank": 5
    },
    "claude_sonnet_46": {
      "display_name": "Claude Sonnet 4.6",
      "provider": "openrouter",
      "average_score": 7.48,
      "score_count": 5,
      "min_score": 6,
      "max_score": 8.6,
      "rank": 6
    },
    "claude_opus_46": {
      "display_name": "Claude Opus 4.6",
      "provider": "openrouter",
      "average_score": 7.39,
      "score_count": 4,
      "min_score": 4.65,
      "max_score": 8.8,
      "rank": 7
    },
    "gemini_31_pro": {
      "display_name": "Gemini 3.1 Pro",
      "provider": "openrouter",
      "average_score": 4.19,
      "score_count": 4,
      "min_score": 1.65,
      "max_score": 7.6,
      "rank": 8
    }
  },
  "judgments": [
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 8.8,
      "brief_justification": "Implementation is accurate and complete with proofs and tests; clarity and structure are excellent; depth is good but could include more edge cases; usefulness is high for practical CRDT understanding."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 8.6,
      "brief_justification": "Implementation is correct and complete with clear structure. Proof is logical but incomplete as it cuts off mid-sentence. Tests are missing, reducing completeness and usefulness slightly."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 8.6,
      "brief_justification": "Accurate implementation, thorough proof, and clear tests with conflict resolution examples."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 6,
      "brief_justification": "The response provides a correct and well-structured implementation of the CRDT, but it cuts off mid-way through the mathematical proof and completely omits the requested tests."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 8.95,
      "brief_justification": ""
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 6.75,
      "brief_justification": "The response provides a correct Python implementation and begins a solid mathematical proof, but cuts off abruptly, leaving the proof incomplete and entirely omitting the requested tests for conflict resolution."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 9.8,
      "brief_justification": "The response provides a flawless implementation of an LWW-Element-Set CRDT, a concise and mathematically sound proof of its properties, and comprehensive tests demonstrating conflict resolution."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 7.9,
      "brief_justification": "The implementation and proofs are highly accurate and well-explained, but the response is incomplete as it cuts off during the test cases."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.65,
      "brief_justification": "The implementation is technically correct and well-structured, with proper LWW semantics, configurable bias, and clean merge logic. The proof sketch is rigorous and well-presented. The main deduction is that the response appears to be cut off mid-test, leaving the test suite incomplete. The core implementation and proofs are excellent, but the truncated tests reduce completeness and usefulness scores slightly."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 7.65,
      "brief_justification": "The implementation is well-structured and correct for the core LWW-Element-Set CRDT with good documentation. However, the response is cut off mid-implementation (merge_in_place is incomplete), missing the static helper method, the proof of CRDT properties, and the conflict resolution tests that were explicitly requested. The parts that are present show good depth and clarity, but the incompleteness significantly impacts the score."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 5.1,
      "brief_justification": "The response starts well with a clear implementation structure and correct conceptual explanation of LWW-Element-Set. However, the response is truncated mid-implementation, cutting off during the remove method. The promised proof of commutativity, associativity, and idempotency is missing, as are the merge operation, lookup completion, and conflict resolution tests. What exists is correct but the response is severely incomplete."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 8.8,
      "brief_justification": "The implementation is correct and well-structured with proper LWW semantics. The proof covers commutativity, associativity, and idempotency with clear mathematical notation. The code is clean with good typing and documentation. Minor gaps: the proof appears truncated (associativity proof cuts off mid-sentence, idempotency proof not shown in the excerpt), and tests for conflict resolution between divergent replicas were mentioned but not shown in the provided response. These omissions reduce comp"
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 1.65,
      "brief_justification": "The response is incomplete, cutting off mid-sentence in the remove method. It lacks the merge operation, the proof of mathematical properties, and the requested tests. The provided code snippet is syntactically broken."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 7.6,
      "brief_justification": "Implementation is correct and clear, but proof is incomplete and tests are missing."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 8.8,
      "brief_justification": "The implementation is correct and well-structured, covering all required operations (add, remove, lookup, merge). The proof of commutativity, associativity, and idempotency is mathematically sound and clearly explained. Tests cover key conflict resolution scenarios including tie-breaking, late arrivals, and divergent replicas. Minor gap: the proof could more explicitly address the combined (A,R) pair rather than just individual maps, and the merge method mutates in-place rather than returning a "
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 8,
      "brief_justification": "The implementation is largely correct with proper LWW semantics using max timestamps for merge. The proof is mathematically sound and well-structured. The code is clean and readable. However, the response is cut off mid-test (test_concurrent_add_remove_conflict is incomplete), which is a significant completeness issue. The bias toward add-wins vs remove-wins in ties (strictly greater than) is correctly noted. Missing some edge case tests and the truncation prevents full evaluation of the test su"
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.6,
      "brief_justification": "Implementation is correct, clear, and complete with proper CRDT semantics, mathematical proof, and tests. The response is cut off at the end of the test class, slightly reducing completeness."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 8.45,
      "brief_justification": "Implementation is correct, clear, and mostly complete with good CRDT properties explanation and tests implied. Minor incompleteness in the provided code snippet (merge_in_place is cut off) and missing explicit proof/tests in the response, though structure is strong."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 8.6,
      "brief_justification": "Implementation is correct and well-structured with proper bias handling and tests implied. Proof is started but cut off, reducing completeness slightly. Code is clear, idiomatic, and useful for CRDT understanding."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 7.4,
      "brief_justification": "Implementation is mostly correct and clear with good tests, but has a bug in remove() (should not require element in add_set) and merge() incorrectly uses 0 as default timestamp (should use -inf or handle missing keys properly). Proofs are high-level but valid. Completeness is strong with all requested features."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 8.6,
      "brief_justification": "Implementation is correct and matches standard LWW-Element-Set CRDT. Proof is mathematically sound. Tests cover key conflict scenarios including ties and late adds. Minor incompleteness: merge is not symmetric (only merges other into self) and class could benefit from a merge that returns new instance, but overall excellent."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 7.65,
      "brief_justification": "Implementation is mostly correct with proper LWW logic and good proof of CRDT properties. Merge is correctly implemented via per-element max timestamps. Tests are useful but incomplete (final test is truncated). Minor issues: uses wall-clock time (not ideal for real CRDTs), get_operations is defined but unused in merge, and remove before add handling is present but not fully tested."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 9.45,
      "brief_justification": "Comprehensive implementation with proofs and tests, well-structured and clear."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 8.8,
      "brief_justification": "Well-structured implementation with thorough coverage of LWW-Element-Set CRDT properties and operations. Includes merge proofs and bias handling."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 7.6,
      "brief_justification": "Accurate implementation with clear structure, but missing merge operation proof and tests."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 8.6,
      "brief_justification": "The implementation is accurate and thorough, with clear documentation and a good proof of properties. The code is well-structured and practical for real-world use."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 9.6,
      "brief_justification": "The response provides a thorough and accurate implementation of the LWW-Element-Set CRDT, including detailed proofs for merge properties. The clarity is high, though some parts could be slightly more concise. The depth and usefulness are excellent, offering practical value and insightful analysis."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 9.6,
      "brief_justification": "Accurate implementation with thorough proofs and tests."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 8.8,
      "brief_justification": "Comprehensive implementation with detailed proof and tests, demonstrating deep understanding of CRDT properties."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.25,
      "brief_justification": "Implementation and proof are correct and clearly presented; however, the test suite is incomplete, missing full conflict resolution examples, reducing overall completeness and usefulness."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 4.65,
      "brief_justification": "The response provides a solid core implementation and clear explanations, but the code is incomplete (truncated merge_in_place), lacks the required formal proof of merge properties, and does not include the requested tests, reducing overall completeness and practical usefulness."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 2.4,
      "brief_justification": "Response is truncated and missing key functionalities (lookup, merge, proof, tests). The provided portion is clear but incomplete and not functional."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 6.6,
      "brief_justification": "Implementation is correct and well-documented, but the answer omits required test cases and the proof is incomplete, reducing completeness and depth."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 8.8,
      "brief_justification": "Accurate implementation with clear code, full coverage of required operations, solid proofs of CRDT properties, and demonstrative tests; minor depth limitation regarding tie-breaking details."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 7.1,
      "brief_justification": "Implementation is correct and well-documented. However, the response appears truncated - the proof section ends mid-sentence (associativity proof incomplete), no idempotency proof is provided, and critically no tests demonstrating conflict resolution are included. The response covers about 60-70% of what was requested."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 8.45,
      "brief_justification": "Well-structured implementation with correct LWW-Element-Set logic, comprehensive proof of CRDT properties using max operation, and good conflict resolution tests. Minor improvements could include returning self from merge for chaining and more edge case tests."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 7.65,
      "brief_justification": "Well-structured implementation with proper LWW semantics and a solid mathematical proof of convergence properties. The response is cut off mid-test, losing completeness points, but the core implementation and proofs are strong and insightful."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.8,
      "brief_justification": "Implementation is correct and well-structured. Proof is rigorous. Tests are incomplete (truncated), preventing a perfect score."
    }
  ],
  "meta": {
    "source": "The Multivac (app.themultivac.com)",
    "methodology": "10x10 blind peer matrix evaluation",
    "criteria": "correctness, completeness, clarity, depth, usefulness",
    "self_judgments": "excluded from rankings",
    "license": "Open data — cite as: The Multivac (2026)"
  }
}