{
  "evaluation_id": "EVAL-20260402-144035",
  "question_id": "CODE-025",
  "question_text": "Implement an LRU cache with per-key TTL (time-to-live) support. Requirements: O(1) get/put, thread-safe, lazy expiration (don't use background threads), configurable max size, eviction callback, and cache hit/miss statistics. Include comprehensive tests.",
  "category": "code",
  "timestamp": "2026-04-02T14:40:35.802Z",
  "display_date": "Apr 02, 2026",
  "winner": {
    "name": "Gemini 3 Flash Preview",
    "provider": "Google",
    "score": 8.34
  },
  "avg_score": 6.4322224,
  "matrix_size": 78,
  "models_used": [
    {
      "id": "gemini_3_flash",
      "name": "Gemini 3 Flash Preview",
      "provider": "Google"
    },
    {
      "id": "mimo_v2_flash",
      "name": "MiMo-V2-Flash",
      "provider": "Xiaomi"
    },
    {
      "id": "gpt_5_4",
      "name": "GPT-5.4",
      "provider": "openrouter"
    },
    {
      "id": "claude_opus_46",
      "name": "Claude Opus 4.6",
      "provider": "openrouter"
    },
    {
      "id": "claude_sonnet_46",
      "name": "Claude Sonnet 4.6",
      "provider": "openrouter"
    },
    {
      "id": "grok_420",
      "name": "Grok 4.20",
      "provider": "openrouter"
    },
    {
      "id": "deepseek_v4",
      "name": "DeepSeek V4",
      "provider": "openrouter"
    },
    {
      "id": "gpt_oss_120b",
      "name": "GPT-OSS-120B",
      "provider": "OpenAI"
    },
    {
      "id": "gemini_31_pro",
      "name": "Gemini 3.1 Pro",
      "provider": "openrouter"
    }
  ],
  "rankings": {
    "gemini_3_flash": {
      "display_name": "Gemini 3 Flash Preview",
      "provider": "Google",
      "average_score": 8.34,
      "score_count": 9,
      "min_score": 6.75,
      "max_score": 9.6,
      "rank": 1
    },
    "gpt_5_4": {
      "display_name": "GPT-5.4",
      "provider": "openrouter",
      "average_score": 7.9,
      "score_count": 9,
      "min_score": 5.5,
      "max_score": 9.6,
      "rank": 2
    },
    "mimo_v2_flash": {
      "display_name": "MiMo-V2-Flash",
      "provider": "Xiaomi",
      "average_score": 6.92,
      "score_count": 9,
      "min_score": 4.35,
      "max_score": 9.6,
      "rank": 3
    },
    "grok_420": {
      "display_name": "Grok 4.20",
      "provider": "openrouter",
      "average_score": 6.86,
      "score_count": 9,
      "min_score": 4.4,
      "max_score": 9.2,
      "rank": 4
    },
    "deepseek_v4": {
      "display_name": "DeepSeek V4",
      "provider": "openrouter",
      "average_score": 6.56,
      "score_count": 8,
      "min_score": 4.35,
      "max_score": 7.95,
      "rank": 5
    },
    "claude_opus_46": {
      "display_name": "Claude Opus 4.6",
      "provider": "openrouter",
      "average_score": 6.42,
      "score_count": 9,
      "min_score": 3.5,
      "max_score": 9,
      "rank": 6
    },
    "claude_sonnet_46": {
      "display_name": "Claude Sonnet 4.6",
      "provider": "openrouter",
      "average_score": 6.13,
      "score_count": 9,
      "min_score": 3.05,
      "max_score": 8.8,
      "rank": 7
    },
    "gpt_oss_120b": {
      "display_name": "GPT-OSS-120B",
      "provider": "OpenAI",
      "average_score": 6.04,
      "score_count": 9,
      "min_score": 2.05,
      "max_score": 9.2,
      "rank": 8
    },
    "gemini_31_pro": {
      "display_name": "Gemini 3.1 Pro",
      "provider": "openrouter",
      "average_score": 2.72,
      "score_count": 7,
      "min_score": 1.2,
      "max_score": 4.3,
      "rank": 9
    }
  },
  "judgments": [
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 9.6,
      "brief_justification": "Accurate implementation with thorough coverage, clear explanation, and practical tests."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 9.6,
      "brief_justification": "Comprehensive and correct implementation with detailed design decisions and thorough tests."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 7.65,
      "brief_justification": "The implementation meets core requirements with clear, well-structured code and detailed features, but the test suite is incomplete, limiting completeness and practical usefulness."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 8.6,
      "brief_justification": "Implementation is accurate, well-structured, and covers all requirements with good depth and practical utility."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 7.6,
      "brief_justification": "Implementation is mostly correct and well-structured, but incomplete (missing tests and some methods)."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 8.15,
      "brief_justification": "Implementation is largely correct and thread-safe with lazy expiration, but has a potential race condition in get() method and missing test cases for concurrency and edge cases."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 7.75,
      "brief_justification": "Implementation is mostly correct and well-structured, but has a minor bug in TTL handling and incomplete test code."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 8.6,
      "brief_justification": "Implementation is accurate and meets all requirements, with clear structure and thorough tests. Minor incompleteness in the provided code snippet (cut off), but overall design is sound."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 3.5,
      "brief_justification": "The response starts with a plausible design but is truncated mid-method, so the implementation and tests are incomplete. What is present is reasonably clear and mostly sound, but it does not satisfy the request as delivered."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 1.65,
      "brief_justification": "The response is truncated and does not provide the actual implementation or tests. It mentions appropriate concepts like hashmap+doubly linked list and locking, but fails to satisfy the request for a complete, thread-safe TTL LRU cache with callback, stats, and comprehensive tests."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 3.05,
      "brief_justification": "The response is truncated mid-implementation, so it is not runnable and omits required tests. What is present is reasonably structured and conceptually aligned with O(1) LRU+TTL using OrderedDict and locks, but missing large portions prevent full correctness and usefulness."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 4.4,
      "brief_justification": "Readable and partly well-structured, but it does not actually satisfy O(1) get/put due to LinkedHashSet updates under a read lock and size()/cleanExpired() scanning all entries. Thread-safety is flawed because get() mutates accessOrder while only holding the read lock. clear() skips eviction callbacks/stat semantics, and evictions count includes explicit removals. Tests are truncated/incomplete, so the response fails the 'comprehensive tests' requirement and overall is only partially usable."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 8.8,
      "brief_justification": "Well-implemented LRU cache with TTL, thread safety, and statistics. Minor room for improvement in edge cases."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 4.35,
      "brief_justification": "Readable and mostly structured, but it does not meet the O(1) requirement because lazy expiration scans all TTL entries, the tests are incomplete/truncated, callback semantics are questionable for updates/expiration, and returning None makes cached None indistinguishable from a miss."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 2.05,
      "brief_justification": "The response is incomplete and cuts off mid-implementation, so it does not deliver a working cache or any tests. What is present is mostly clear and conceptually reasonable, but key requirements are unmet in the provided answer."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 6.75,
      "brief_justification": "Clear and mostly correct, but it misses some edge cases and weakens the O(1) and TTL requirements in practice. Expired items are only removed on get, so put may evict a live LRU while expired entries still occupy capacity. TTL handling treats ttl_seconds=0 as no expiry, and tests are not fully comprehensive for concurrency, callback behavior on TTL eviction, updates, zero/None TTL, or lazy expiration during put."
    },
    {
      "judge_key": "gpt_5_4",
      "judge_name": "GPT-5.4",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 4.35,
      "brief_justification": "Reasonably clear design and partial implementation, but it does not fully meet the O(1) requirement due to heap usage, has stale-entry/updated-key expiration correctness issues, __len__ is not thread-safe, callback semantics are underspecified, and the test suite is incomplete/truncated."
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 6.5,
      "brief_justification": "The implementation is well-structured with correct O(1) get/put using HashMap + doubly linked list, proper thread safety via ReentrantLock, lazy expiration, configurable max size, eviction callbacks with reason enum, and hit/miss/eviction statistics. The Clock abstraction for testing is a nice design choice. However, the response is critically incomplete - the test file is cut off after the first test method declaration, so the 'comprehensive tests' requirement is not met. There's also a subtle "
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 1.2,
      "brief_justification": "The response is severely truncated and incomplete. It only provides a brief introduction mentioning the data structures and threading approach but contains no actual implementation code, no tests, and no meaningful content. It fails to deliver on virtually every requirement: no LRU cache implementation, no TTL support, no eviction callback, no statistics, and no tests."
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 5.5,
      "brief_justification": "The implementation is well-structured and demonstrates good design with proper use of OrderedDict for O(1) operations, threading.RLock for thread safety, lazy expiration, eviction callbacks, and cache statistics. However, the response is critically incomplete - the code is truncated mid-line in the `keys()` method, and the promised 'comprehensive tests' are entirely missing. The sentinel pattern using `...` for default TTL is clever but unconventional. The architecture and what's visible is corr"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 5.6,
      "brief_justification": "The implementation has several significant issues: (1) Thread-safety bug: the get() method modifies accessOrder (a LinkedHashSet) while only holding a read lock, which is not safe since multiple readers could concurrently modify it. (2) The lock upgrade pattern (read->write) in get() and containsKey() is problematic - after releasing the read lock and acquiring the write lock, the state may have changed, requiring re-validation. (3) LinkedHashSet doesn't provide true O(1) LRU ordering - remove i"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 6,
      "brief_justification": "The implementation has several correctness issues: (1) _cleanup() scans all expiry times on every get/put, making it O(n) not O(1); (2) the get() method calls _cleanup() which may already expire the key, then checks expiration again redundantly, potentially double-counting expired stats; (3) _remove_key calls eviction_callback even for non-eviction removals (like updating an existing key via put), conflating updates with evictions; (4) storing None values is indistinguishable from cache misses s"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 4.9,
      "brief_justification": "The response provides a solid design overview and begins a well-structured implementation with correct O(1) linked-list operations, thread safety via RLock, and lazy expiration. However, the code is truncated mid-function (the `put` method is incomplete), and the promised comprehensive test suite is entirely missing. Key functionality like eviction in `put`, `delete`, `stats()`, and the full eviction callback logic are absent. While the design table and what exists of the code are clear and corr"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 7.8,
      "brief_justification": "The implementation is correct and well-structured with O(1) get/put, thread safety via RLock, lazy expiration, eviction callbacks, and stats tracking. However, there are some gaps: (1) expired items still occupy capacity - put() doesn't check for expired entries when evicting, potentially evicting valid LRU entries while expired ones remain; (2) tests are decent but not truly 'comprehensive' - missing edge cases like zero capacity, updating existing keys, TTL on eviction callback for expired ite"
    },
    {
      "judge_key": "claude_opus_46",
      "judge_name": "Claude Opus 4.6",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 6.5,
      "brief_justification": "The implementation is mostly correct with good design decisions (doubly linked list, heap for TTL, RLock for thread safety, lazy expiration). However, there are issues: (1) the response is truncated - the test suite is cut off mid-test, missing comprehensive tests which were a core requirement; (2) the eviction callback is not triggered on TTL expiration during get() - only during _evict_one(); (3) the heap can grow unboundedly with updates since old entries are never cleaned; (4) the put() meth"
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 5.5,
      "brief_justification": ""
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 5.75,
      "brief_justification": "The code demonstrates excellent architectural choices and thread-safety considerations, but the response is severely truncated, missing the core get/put methods and all requested tests."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 6.5,
      "brief_justification": "The implementation is well-designed, thread-safe, and meets most requirements with excellent clarity and depth. However, the response is severely truncated, resulting in incomplete code and completely missing the requested comprehensive tests."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 4.55,
      "brief_justification": ""
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 6.15,
      "brief_justification": ""
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 5.15,
      "brief_justification": "The response starts with an excellent design and well-structured code, but it cuts off mid-implementation. It is missing the rest of the cache methods and the requested comprehensive tests."
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 9,
      "brief_justification": ""
    },
    {
      "judge_key": "gemini_31_pro",
      "judge_name": "Gemini 3.1 Pro",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 6.2,
      "brief_justification": ""
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 9.2,
      "brief_justification": "Highly accurate implementation meeting all requirements, thorough test coverage, clear structure, insightful design choices, and practical utility."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 9.2,
      "brief_justification": "Highly detailed and accurate implementation with thorough coverage of requirements."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.2,
      "brief_justification": "The implementation is well-structured with correct O(1) get/put using HashMap + doubly linked list, proper ReentrantLock thread safety, lazy expiration, eviction callbacks with reasons, and statistics. The FakeClock for testing is excellent. Minor issues: eviction listener for REPLACED reason is fired outside the lock (potential ordering issue), and the test file appears truncated. The design choices like EvictionReason enum and Stats class add good depth. Overall a solid, production-quality imp"
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 6.65,
      "brief_justification": "The implementation shows strong design with correct O(1) operations via OrderedDict, proper thread safety with RLock, good separation of concerns, and well-documented code. However, the response is truncated - it cuts off mid-sentence in the get() method docstring, missing the complete get/put implementations, the test suite, and other promised features. The partial code shown is high quality with good patterns (sentinel for missing values, callback firing outside lock, injectable time function "
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 1.85,
      "brief_justification": "The response is severely incomplete - it appears to be cut off mid-sentence and contains no actual implementation code, no tests, and none of the required features (TTL support, eviction callback, statistics, configurable max size). Only a brief description of the approach is provided without any working code."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 6.75,
      "brief_justification": "The implementation has good structure with Builder pattern, eviction callbacks, and statistics. However, there are critical correctness issues: the lock upgrade pattern in get() (read->write) is incorrect and can cause deadlocks with ReentrantReadWriteLock (you can't upgrade from read to write lock directly). The updateAccessOrder() call happens while holding only a read lock, but modifies the LinkedHashSet which isn't thread-safe. The size() method calls cleanExpired() while holding a read lock"
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 6.75,
      "brief_justification": "The implementation is mostly correct but has several issues: (1) The _cleanup() call inside get() iterates all expiry_times making it O(n) not O(1), (2) None values are indistinguishable from cache misses since both return None, (3) _remove_key calls eviction_callback even for TTL expiration which conflates eviction with expiration, (4) The thread safety test assertion 'hits == 1000' is fragile since concurrent puts could evict keys before gets, (5) The test file is truncated. The code structure"
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 6.05,
      "brief_justification": "The implementation is well-structured with a clear design overview and good code quality, but the response is truncated mid-implementation (the `put` method is cut off). The test suite mentioned in the requirements is completely missing. The design choices are sound (doubly-linked list + hashmap for O(1), RLock for thread safety, lazy expiration), but the incomplete code and missing tests significantly reduce completeness and usefulness scores."
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 7.55,
      "brief_justification": "The implementation is well-structured and covers most requirements. However, there are notable issues: (1) `ttl_seconds=0` is treated as no TTL due to the falsy check `if ttl_seconds`, which is a bug; (2) lazy expiration only happens on `get()`, not on `put()` when checking capacity, meaning expired entries count toward capacity; (3) the thread safety test assertion `stats['size'] == 100` is fragile and may not reliably pass; (4) tests are somewhat minimal—no test for updating existing keys, no "
    },
    {
      "judge_key": "claude_sonnet_46",
      "judge_name": "Claude Sonnet 4.6",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 7,
      "brief_justification": "The implementation is well-structured with good design decisions documented. The core LRU+TTL logic is mostly correct with proper use of doubly linked list and min-heap. Thread safety via RLock is appropriate. However, there's a bug in _evict_one: when checking node.expiry against time.monotonic() after already confirming expiry <= time.monotonic() from the heap, the condition uses '>' which could fail in edge cases. The response is also truncated - the test suite is cut off mid-test, missing mo"
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.6,
      "brief_justification": "Implementation meets all specified requirements with correct O(1) operations, proper lazy expiration, thread-safety via ReentrantLock, eviction callbacks, and stats. Tests are partially shown but structure is comprehensive. Minor issues: clear() doesn't reset stats, potential callback ordering edge cases, and incomplete test code in response."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 6.05,
      "brief_justification": "Implementation is incomplete (get method is truncated, put/contains/size methods missing entirely), but the provided code shows good design with proper thread-safety, lazy expiration, stats, and O(1) structures. Some logical issues like using RLock unnecessarily and incomplete expiration handling during LRU eviction."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 3.6,
      "brief_justification": "The response is incomplete and cuts off mid-sentence. It describes high-level design choices but does not provide any actual code implementation, tests, or detailed explanations for all required features like TTL, lazy expiration, eviction callback, or statistics."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 6.4,
      "brief_justification": "Implementation is mostly correct and clear but incomplete (code is truncated at the end, missing _remove, _evict_lru, stats properties, full tests). Meets most requirements but lacks full functionality and test suite."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 6,
      "brief_justification": "Implementation has several bugs (cleanup called too often, stats incremented incorrectly, eviction callback called on explicit removes, incomplete test) but covers most requirements with reasonable structure."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 5.85,
      "brief_justification": "Implementation is mostly correct and clear but incomplete (code is truncated mid-line in put method, missing several required methods like delete, stats, and full eviction logic). Tests are mentioned but not provided. Design overview is solid but code doesn't fully deliver on all requirements."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 8,
      "brief_justification": "Implementation meets all core requirements with correct LRU+TTL mechanics, thread-safety, and tests. Minor issues: stats increment on expired gets counts as miss but increments after potential eviction; eviction callback not triggered on TTL expiry; capacity check uses len(self.cache) which is fine but could be optimized; overall solid and practical."
    },
    {
      "judge_key": "grok_420",
      "judge_name": "Grok 4.20",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 6.2,
      "brief_justification": "The implementation has several correctness issues: it uses a custom doubly-linked list but imports OrderedDict without using it; _evict_one only evicts one item even when called in a loop; heap can grow unbounded due to lazy deletion without cleanup; tests are truncated and incomplete. Design is mostly sound but flawed in practice. Code is well-structured and documented."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 9.6,
      "brief_justification": "The implementation meets all specified requirements with thorough testing and clear documentation."
    },
    {
      "judge_key": "deepseek_v4",
      "judge_name": "DeepSeek V4",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 8.8,
      "brief_justification": "Comprehensive implementation with all requested features, clear structure, and good depth. Minor deduction for depth as some edge cases could be explored further."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 4.85,
      "brief_justification": "The response provides a solid skeleton with clear design and thread‑safety, but the implementation is incomplete (truncated get/put methods) and lacks the required comprehensive tests, reducing overall correctness, completeness, and practical usefulness."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 2.85,
      "brief_justification": "Response is truncated and lacks the actual implementation, TTL handling, statistics, and tests, making it largely incomplete and of limited practical value."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 3.2,
      "brief_justification": "The answer outlines a design with clear documentation, but the implementation is incomplete (missing key methods, broken code) and provides no tests, making it largely non-functional."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 7,
      "brief_justification": "Implementation meets most functional requirements and is reasonably thread‑safe, but minor concurrency nuances and an incomplete test suite reduce overall correctness and completeness."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 7.55,
      "brief_justification": "Implementation meets most requirements and includes thorough tests, but cleanup scans all expiries (not strict O(1)) and one test is incomplete, slightly reducing correctness and completeness."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 8.8,
      "brief_justification": "Accurate implementation meeting all specs, thorough tests, clear code, good design insight, highly practical."
    },
    {
      "judge_key": "gpt_oss_120b",
      "judge_name": "GPT-OSS-120B",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 6.55,
      "brief_justification": "Implementation meets most requirements and is clear, but violates O(1) get/put due to heap use and the test suite is incomplete, reducing correctness and completeness."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 9.6,
      "brief_justification": "The implementation is robust, thread-safe, and correctly handles edge cases like overflow in TTL calculation. It uses a custom doubly linked list for O(1) performance as requested. While the test suite provided in the snippet was cut off at the very end, the logic within the code is complete and high-quality."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 9,
      "brief_justification": "The implementation is high-quality, using OrderedDict for O(1) LRU operations and RLock for thread safety. It correctly implements lazy expiration and provides detailed statistics. The response was cut off before the tests and the end of the get method, which affects completeness."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 4.3,
      "brief_justification": "The response is incomplete and cuts off mid-sentence. It fails to provide the actual implementation, tests, or the full logic required by the prompt."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 8.45,
      "brief_justification": "The implementation is robust, thread-safe, and follows the requirements well. However, the code provided is truncated at the very end (the keys() method is incomplete), and the 'comprehensive tests' requested were not included in the output."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 8.55,
      "brief_justification": "The implementation is solid and meets all requirements. It uses a ReentrantReadWriteLock for thread safety and a LinkedHashSet for O(1) LRU management. One minor correctness issue: updateAccessOrder is called inside a read lock in the get() method, but LinkedHashSet.remove/add are write operations, which would cause a ConcurrentModificationException or race condition if not upgraded to a write lock. The test suite is cut off at the end."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "deepseek_v4",
      "respondent_name": "DeepSeek V4",
      "weighted_score": 7.95,
      "brief_justification": "The implementation is clear and covers all requirements, including thread safety and stats. However, the O(1) requirement is violated in the _cleanup method, which iterates over all keys in the expiry dictionary on every get/put call, making those operations O(N). The tests are also cut off at the very end."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 7.85,
      "brief_justification": "The implementation is logically sound and follows the requirements well, including thread-safety and lazy expiration. However, the code provided is truncated (the 'put' method and tests are cut off), making it incomplete for immediate use."
    },
    {
      "judge_key": "gemini_3_flash",
      "judge_name": "Gemini 3 Flash Preview",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 9,
      "brief_justification": "The implementation is robust and correctly addresses all requirements. It uses a Doubly Linked List for LRU and a Min-Heap for TTL management with lazy invalidation. The code is thread-safe and includes a clear explanation of design decisions. The test suite is truncated at the very end, but the logic provided is high quality."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 6.85,
      "brief_justification": "Solid implementation with O(1) operations, thread-safety, TTL, and eviction callbacks. However, tests are incomplete (file ends abruptly at @Test void), and eviction callback for REPLACED is fired outside the lock which could cause race conditions."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "claude_opus_46",
      "respondent_name": "Claude Opus 4.6",
      "weighted_score": 4.55,
      "brief_justification": "Implementation is incomplete - the get method is cut off mid-sentence, put method is missing, and no tests are included. The partial code shows good design with proper TTL handling, statistics, and thread-safety, but the response fails to provide the core functionality requested."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gemini_31_pro",
      "respondent_name": "Gemini 3.1 Pro",
      "weighted_score": 3.6,
      "brief_justification": "Response claims to provide a complete implementation but only offers a brief high-level explanation. No actual code or tests included. Explanation cuts off mid-sentence. The approach mentioned (hash map + doubly linked list) is correct for O(1) operations, but the response fails to deliver the promised implementation."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "claude_sonnet_46",
      "respondent_name": "Claude Sonnet 4.6",
      "weighted_score": 5.7,
      "brief_justification": "Implementation shows good understanding of requirements with O(1) operations, thread safety, lazy expiration, and statistics. However, the response is incomplete - the code cuts off mid-implementation (keys() method incomplete), and critically, no tests are provided despite the explicit requirement for comprehensive tests."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "grok_420",
      "respondent_name": "Grok 4.20",
      "weighted_score": 7.5,
      "brief_justification": "Good implementation covering all requirements with O(1) operations, thread-safety, lazy expiration, and statistics. However, has lock handling bugs in get() and containsKey() where read lock is improperly re-acquired after write lock release. Tests are incomplete (cut off)."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gpt_oss_120b",
      "respondent_name": "GPT-OSS-120B",
      "weighted_score": 4.7,
      "brief_justification": "Response provides good design overview and starts implementation but cuts off mid-code (put method incomplete). No tests included despite claiming full test-suite. Implementation incomplete and unusable."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 8.55,
      "brief_justification": "Well-implemented LRU cache with all required features. Correct O(1) doubly-linked list + hash map approach, proper lazy TTL expiration, thread-safe with RLock. Minor edge case with capacity=0 not handled. Comprehensive tests cover all major functionality."
    },
    {
      "judge_key": "minimax_m25",
      "judge_name": "MiniMax M2.5",
      "respondent_key": "mimo_v2_flash",
      "respondent_name": "MiMo-V2-Flash",
      "weighted_score": 6.9,
      "brief_justification": ""
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gpt_5_4",
      "respondent_name": "GPT-5.4",
      "weighted_score": 8.6,
      "brief_justification": "Implementation is accurate, covers all requirements, well-structured, and includes thorough tests, though some edge cases could be expanded."
    },
    {
      "judge_key": "mimo_v2_flash",
      "judge_name": "MiMo-V2-Flash",
      "respondent_key": "gemini_3_flash",
      "respondent_name": "Gemini 3 Flash Preview",
      "weighted_score": 9,
      "brief_justification": "Accurate implementation with O(1) operations, thread safety, lazy expiration, and comprehensive tests. Minor edge case: expired keys on put not explicitly handled, but tests cover core functionality."
    }
  ],
  "meta": {
    "source": "The Multivac (app.themultivac.com)",
    "methodology": "10x10 blind peer matrix evaluation",
    "criteria": "correctness, completeness, clarity, depth, usefulness",
    "self_judgments": "excluded from rankings",
    "license": "Open data — cite as: The Multivac (2026)"
  }
}