{
  "evaluation_id": "EVAL-20260317-022615",
  "question_id": "EVAL-20260317-022615",
  "question_text": "Implement an LRU cache with per-key TTL (time-to-live) support. Requirements: O(1) get/put, thread-safe, lazy expiration (don't use background threads), configurable max size, eviction callback, and cache hit/miss statistics. Include comprehensive tests.",
  "category": "code",
  "timestamp": "2026-03-17T02:26:15.283Z",
  "display_date": "Mar 17, 2026",
  "winner": {
    "name": "Qwen 3.5 35B-A3B",
    "provider": "openrouter",
    "score": 7.83
  },
  "avg_score": 6.6266665,
  "matrix_size": 37,
  "models_used": [
    {
      "id": "qwen3_8b",
      "name": "Qwen 3 8B",
      "provider": "openrouter"
    },
    {
      "id": "qwen3_coder_next",
      "name": "Qwen 3 Coder Next",
      "provider": "openrouter"
    },
    {
      "id": "qwen35_122b_a10b",
      "name": "Qwen 3.5 122B-A10B",
      "provider": "openrouter"
    },
    {
      "id": "qwen35_397b_a17b",
      "name": "Qwen 3.5 397B-A17B",
      "provider": "openrouter"
    },
    {
      "id": "qwen35_9b",
      "name": "Qwen 3.5 9B",
      "provider": "openrouter"
    },
    {
      "id": "qwen35_35b_a3b",
      "name": "Qwen 3.5 35B-A3B",
      "provider": "openrouter"
    }
  ],
  "rankings": {
    "qwen35_35b_a3b": {
      "display_name": "Qwen 3.5 35B-A3B",
      "provider": "openrouter",
      "average_score": 7.83,
      "score_count": 6,
      "min_score": 6.15,
      "max_score": 9.6,
      "rank": 1
    },
    "qwen3_coder_next": {
      "display_name": "Qwen 3 Coder Next",
      "provider": "openrouter",
      "average_score": 7.44,
      "score_count": 6,
      "min_score": 7,
      "max_score": 8.6,
      "rank": 2
    },
    "qwen3_8b": {
      "display_name": "Qwen 3 8B",
      "provider": "openrouter",
      "average_score": 7.22,
      "score_count": 5,
      "min_score": 4.8,
      "max_score": 8.8,
      "rank": 3
    },
    "qwen35_397b_a17b": {
      "display_name": "Qwen 3.5 397B-A17B",
      "provider": "openrouter",
      "average_score": 6.47,
      "score_count": 7,
      "min_score": 4.2,
      "max_score": 9,
      "rank": 4
    },
    "qwen35_122b_a10b": {
      "display_name": "Qwen 3.5 122B-A10B",
      "provider": "openrouter",
      "average_score": 6.18,
      "score_count": 7,
      "min_score": 4.4,
      "max_score": 7.8,
      "rank": 5
    },
    "qwen35_9b": {
      "display_name": "Qwen 3.5 9B",
      "provider": "openrouter",
      "average_score": 4.62,
      "score_count": 6,
      "min_score": 2.45,
      "max_score": 6.85,
      "rank": 6
    }
  },
  "judgments": [
    {
      "judge_key": "qwen35_9b",
      "judge_name": "Qwen 3.5 9B",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 7.55,
      "brief_justification": "Implementation is mostly correct with O(1) operations and thread safety, but has bugs: eviction callback receives stale values on TTL expiration, and expiration check in put() doesn't properly validate existing expired nodes. Tests are basic and lack edge case coverage. Code is well-structured and readable."
    },
    {
      "judge_key": "qwen35_9b",
      "judge_name": "Qwen 3.5 9B",
      "respondent_key": "qwen3_coder_next",
      "respondent_name": "Qwen 3 Coder Next",
      "weighted_score": 7,
      "brief_justification": "Implementation is technically correct with proper thread-safety and O(1) operations, but the response is truncated mid-test function, failing to deliver the requested comprehensive tests. The code cuts off at 'def test_ttl_expiration(self):' without completion."
    },
    {
      "judge_key": "qwen35_9b",
      "judge_name": "Qwen 3.5 9B",
      "respondent_key": "qwen35_122b_a10b",
      "respondent_name": "Qwen 3.5 122B-A10B",
      "weighted_score": 7,
      "brief_justification": "Implementation is mostly correct with proper thread safety and TTL support, but the test file is incomplete (cuts off mid-test). Some design choices like put() returning old_value are unconventional. Core functionality works but edge cases lack comprehensive testing."
    },
    {
      "judge_key": "qwen35_9b",
      "judge_name": "Qwen 3.5 9B",
      "respondent_key": "qwen35_397b_a17b",
      "respondent_name": "Qwen 3.5 397B-A17B",
      "weighted_score": 5.3,
      "brief_justification": "The implementation demonstrates correct design patterns (OrderedDict for O(1) operations, RLock for thread safety, lazy expiration), but the code is cut off mid-function and critically lacks the comprehensive tests explicitly required. Additionally, there are minor inconsistencies like the put() return type annotation not matching actual behavior."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "qwen35_9b",
      "respondent_name": "Qwen 3.5 9B",
      "weighted_score": 6.05,
      "brief_justification": "The implementation shows correct LRU logic but lacks critical methods like _add_to_head and _evict_lru. The code is well-structured but incomplete, making it non-functional as-is."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "qwen3_coder_next",
      "respondent_name": "Qwen 3 Coder Next",
      "weighted_score": 8.6,
      "brief_justification": "The implementation meets all requirements with thread-safety, per-key TTL, and LRU eviction. Tests are partially implemented but show comprehensive coverage. The code is well-structured but lacks full test cases for TTL expiration."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "qwen35_35b_a3b",
      "respondent_name": "Qwen 3.5 35B-A3B",
      "weighted_score": 9.6,
      "brief_justification": "The implementation meets all requirements with correct O(1) operations, thread safety, TTL handling, and comprehensive tests. The code is well-structured but could include more edge case handling in depth."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "qwen35_122b_a10b",
      "respondent_name": "Qwen 3.5 122B-A10B",
      "weighted_score": 7.35,
      "brief_justification": "The implementation addresses most requirements but fails to achieve true O(1) get/put due to list-based access order management. The cleanup_expired method is incomplete, and the code lacks proper handling of concurrent modifications."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "qwen35_397b_a17b",
      "respondent_name": "Qwen 3.5 397B-A17B",
      "weighted_score": 8.1,
      "brief_justification": "The implementation correctly handles LRU with per-key TTL, thread safety, and statistics. However, it lacks comprehensive tests as requested, reducing completeness and usefulness."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "qwen35_9b",
      "respondent_name": "Qwen 3.5 9B",
      "weighted_score": 5.45,
      "brief_justification": "The implementation addresses most requirements but is incomplete (missing linked list management code and tests). Thread safety and TTL logic are correct, but O(1) operations depend on unshown linked list implementation. Clarity is good but depth is limited by the incomplete solution."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 8.8,
      "brief_justification": "The implementation correctly handles LRU eviction, per-key TTL, thread safety, and statistics tracking. It includes comprehensive tests for various scenarios. Minor issues exist in the TTL handling logic when updating existing keys. The code is well-structured and practical for real-world use."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "qwen3_coder_next",
      "respondent_name": "Qwen 3 Coder Next",
      "weighted_score": 7.2,
      "brief_justification": "The implementation correctly combines LRU eviction with per-key TTL using OrderedDict and RLock for thread safety. However, it doesn't prioritize expired keys during eviction (only checks expiration on access), which could lead to unnecessary evictions of valid items. Tests are incomplete and missing key expiration validation during cache pressure scenarios."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "qwen35_35b_a3b",
      "respondent_name": "Qwen 3.5 35B-A3B",
      "weighted_score": 8.15,
      "brief_justification": "The implementation correctly handles most requirements (thread-safety, LRU eviction, stats, etc.) but has a correctness issue in _evict_lru: it doesn't account for expired items when checking max_size. Tests cover key scenarios but the custom clock test is incomplete. The design shows good understanding of data structures and concurrency."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "qwen35_122b_a10b",
      "respondent_name": "Qwen 3.5 122B-A10B",
      "weighted_score": 6.4,
      "brief_justification": "The implementation meets most requirements but uses a list for access_order which breaks O(1) complexity for remove/append operations. Thread safety is handled with RLock but list operations aren't O(1). Tests are incomplete and the cleanup_expired method isn't used in tests. The design lacks efficient data structures for true O(1) LRU tracking."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "qwen35_397b_a17b",
      "respondent_name": "Qwen 3.5 397B-A17B",
      "weighted_score": 8.6,
      "brief_justification": "The implementation correctly addresses most requirements with O(1) operations, thread safety via RLock, lazy expiration, and statistics tracking. The code is well-structured and clear, though some methods are cut off mid-implementation. It lacks comprehensive tests as requested but provides a solid foundation for an LRU cache with per-key TTL."
    },
    {
      "judge_key": "qwen3_coder_next",
      "judge_name": "Qwen 3 Coder Next",
      "respondent_key": "qwen35_9b",
      "respondent_name": "Qwen 3.5 9B",
      "weighted_score": 6.85,
      "brief_justification": "The implementation is mostly correct and well-structured, but the code is incomplete (missing `_evict_lru`, `_move_to_head`, and `_add_to_head` methods), and lacks comprehensive tests as required. Thread safety is addressed with RLock, but the implementation doesn't handle concurrent access to the doubly-linked list properly in all cases. TTL and eviction logic are sound, but edge cases like zero/negative TTL and integer overflow over long periods are not considered."
    },
    {
      "judge_key": "qwen3_coder_next",
      "judge_name": "Qwen 3 Coder Next",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 8.2,
      "brief_justification": "The implementation is mostly correct and covers all required features, but has a subtle bug: in `put()` when updating an existing non-expired node, it doesn't check if the new TTL should reset expiration time properly (it overwrites without considering existing TTL). Also, the eviction callback is called on TTL expiration during `get()`, but not consistently for all eviction paths. Tests are comprehensive but lack assertions—just print statements. Thread safety is implemented correctly with lock"
    },
    {
      "judge_key": "qwen3_coder_next",
      "judge_name": "Qwen 3 Coder Next",
      "respondent_key": "qwen35_35b_a3b",
      "respondent_name": "Qwen 3.5 35B-A3B",
      "weighted_score": 9.55,
      "brief_justification": "The implementation is mostly correct and comprehensive, covering all required features with proper thread safety, O(1) operations via doubly-linked list + dict, lazy TTL expiration, and statistics tracking. However, the test suite is incomplete (cut off mid-test for custom clock) and the eviction logic in put() may evict before inserting a new key even when cache is at capacity, which could cause off-by-one errors in edge cases. Also, the hit/miss counting on TTL expiration double-counts as evic"
    },
    {
      "judge_key": "qwen3_coder_next",
      "judge_name": "Qwen 3 Coder Next",
      "respondent_key": "qwen35_122b_a10b",
      "respondent_name": "Qwen 3.5 122B-A10B",
      "weighted_score": 7.8,
      "brief_justification": "The implementation is mostly correct and well-structured, but has a critical bug: `RLock` is used without being imported (should be `threading.RLock`). The `get()` method incorrectly increments 'misses' twice when an entry expires. The test for `cleanup_expired()` is incomplete (truncated mid-test), and the implementation lacks O(1) operations — `access_order.remove()` is O(n), violating the O(1) requirement. Otherwise, thread-safety, TTL, eviction callback, and stats are reasonably implemented."
    },
    {
      "judge_key": "qwen3_coder_next",
      "judge_name": "Qwen 3 Coder Next",
      "respondent_key": "qwen35_397b_a17b",
      "respondent_name": "Qwen 3.5 397B-A17B",
      "weighted_score": 9,
      "brief_justification": "The implementation is mostly correct and well-structured, with O(1) operations using OrderedDict, thread safety via RLock, lazy expiration, TTL per key, eviction callbacks, and statistics. However, the response is cut off mid-_evict_oldest method, and the eviction callback is invoked only for explicit removals/clears but not for size-based evictions (missing callback invocation in _evict_oldest). Also, the put method doesn't properly handle TTL=0 (immediate expiration) and doesn't invoke evictio"
    },
    {
      "judge_key": "qwen35_35b_a3b",
      "judge_name": "Qwen 3.5 35B-A3B",
      "respondent_key": "qwen35_9b",
      "respondent_name": "Qwen 3.5 9B",
      "weighted_score": 3.45,
      "brief_justification": "The response is critically incomplete as the code truncates mid-function and the promised tests are missing entirely. While the visible structure is clear, the solution cannot be executed or verified."
    },
    {
      "judge_key": "qwen35_35b_a3b",
      "judge_name": "Qwen 3.5 35B-A3B",
      "respondent_key": "qwen3_coder_next",
      "respondent_name": "Qwen 3 Coder Next",
      "weighted_score": 7.65,
      "brief_justification": "The implementation is well-structured with correct O(1) operations, thread safety, and TTL support. However, the test suite is incomplete (cutsoff mid-test at test_ttl_expiration), which significantly impacts completeness and usefulness since comprehensive tests were explicitly requested."
    },
    {
      "judge_key": "qwen35_35b_a3b",
      "judge_name": "Qwen 3.5 35B-A3B",
      "respondent_key": "qwen35_122b_a10b",
      "respondent_name": "Qwen 3.5 122B-A10B",
      "weighted_score": 5.15,
      "brief_justification": "The implementation has critical issues: uses list.remove() and pop(0) which are O(n) not O(1) as required, missing RLock import (uses undefined RLock), and testsare truncated mid-test. While the structure is clear and covers most requirements, the O(1) constraint violation and incomplete test suite significantly impact correctness and completeness."
    },
    {
      "judge_key": "qwen35_35b_a3b",
      "judge_name": "Qwen 3.5 35B-A3B",
      "respondent_key": "qwen35_397b_a17b",
      "respondent_name": "Qwen 3.5 397B-A17B",
      "weighted_score": 5.4,
      "brief_justification": "The implementation logic is sound but the response is truncated mid-function, preventing full validation. Additionally, it completely fails to include the requested comprehensive tests."
    },
    {
      "judge_key": "qwen35_27b",
      "judge_name": "Qwen 3.5 27B",
      "respondent_key": "qwen35_9b",
      "respondent_name": "Qwen 3.5 9B",
      "weighted_score": 3.45,
      "brief_justification": "The response is severely incomplete - code cuts off mid-function, critical methods (_move_to_head, _evict_lru, _add_to_head) are undefined, and no tests are provided despite claiming comprehensive tests. The partial code shown has some logical issues and redundant data structures."
    },
    {
      "judge_key": "qwen35_27b",
      "judge_name": "Qwen 3.5 27B",
      "respondent_key": "qwen3_coder_next",
      "respondent_name": "Qwen 3 Coder Next",
      "weighted_score": 7.1,
      "brief_justification": "Implementation is technically sound with correct LRU logic, thread-safety, and TTL handling, but the response is cut off mid-test suite (test_ttl_expiration incomplete), missing critical test coverage for TTL expiration, thread-safety, and eviction callback functionality."
    },
    {
      "judge_key": "qwen35_27b",
      "judge_name": "Qwen 3.5 27B",
      "respondent_key": "qwen35_35b_a3b",
      "respondent_name": "Qwen 3.5 35B-A3B",
      "weighted_score": 6.2,
      "brief_justification": "The implementation has critical bugs: CacheNode dataclass lacks prev/next attributes causing runtime errors, eviction condition uses <= instead of >= allowing cache to exceed max_size, and test assertions are incorrect (test_concurrent_access expects 1000 items but only 100 unique keys exist). Structure and documentation are good, but code won't work as-is."
    },
    {
      "judge_key": "qwen35_27b",
      "judge_name": "Qwen 3.5 27B",
      "respondent_key": "qwen35_122b_a10b",
      "respondent_name": "Qwen 3.5 122B-A10B",
      "weighted_score": 5.15,
      "brief_justification": "The implementation violates the O(1) complexity constraint by using Python lists for LRU tracking (O(N) operations). Additionally, the test suite is syntactically incomplete (truncated at the end), and the O(1) requirement is a critical failure for this specific request."
    },
    {
      "judge_key": "qwen35_27b",
      "judge_name": "Qwen 3.5 27B",
      "respondent_key": "qwen35_397b_a17b",
      "respondent_name": "Qwen 3.5 397B-A17B",
      "weighted_score": 4.2,
      "brief_justification": "The implementation is truncated and references undefined methods, rendering it non-functional, and it completely omits the requested comprehensive tests."
    },
    {
      "judge_key": "qwen35_122b_a10b",
      "judge_name": "Qwen 3.5 122B-A10B",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 6.75,
      "brief_justification": "The code fails to compile due to a missing import for Consumer and contains a logic bug in put() where expired existing keys are removed but not updated with the new value."
    },
    {
      "judge_key": "qwen35_122b_a10b",
      "judge_name": "Qwen 3.5 122B-A10B",
      "respondent_key": "qwen3_coder_next",
      "respondent_name": "Qwen 3 Coder Next",
      "weighted_score": 7.1,
      "brief_justification": "Implementation is well-structured with correct O(1) operations, thread-safety, and lazy expiration. However, tests are significantly incomplete (cut off mid-method), and __contains__ has a bug where None values could be mistaken for expired keys."
    },
    {
      "judge_key": "qwen35_122b_a10b",
      "judge_name": "Qwen 3.5 122B-A10B",
      "respondent_key": "qwen35_35b_a3b",
      "respondent_name": "Qwen 3.5 35B-A3B",
      "weighted_score": 7.35,
      "brief_justification": "The implementation contains critical logic errors: the eviction condition (`<=`) allows the cache to exceed `max_size`, and the concurrent test expects 1000 items despite overlapping keys. While the structure and feature coverage are comprehensive, these bugs prevent immediate production use."
    },
    {
      "judge_key": "qwen35_122b_a10b",
      "judge_name": "Qwen 3.5 122B-A10B",
      "respondent_key": "qwen35_397b_a17b",
      "respondent_name": "Qwen 3.5 397B-A17B",
      "weighted_score": 4.7,
      "brief_justification": "The implementation design is sound and well-structured, but the response is truncated mid-function, omits critical helper methods, and fails to include the requested comprehensive tests."
    },
    {
      "judge_key": "qwen35_397b_a17b",
      "judge_name": "Qwen 3.5 397B-A17B",
      "respondent_key": "qwen35_9b",
      "respondent_name": "Qwen 3.5 9B",
      "weighted_score": 2.45,
      "brief_justification": "The code is truncated mid-function and lacks critical methods (_evict_lru, _move_to_head) and the requested comprehensive tests, making it non-functional."
    },
    {
      "judge_key": "qwen35_397b_a17b",
      "judge_name": "Qwen 3.5 397B-A17B",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 4.8,
      "brief_justification": "Critical bug in `put` logic causes data loss when updating expired keys, and missing imports prevent compilation. Tests lack assertions and fail to meet the comprehensive requirement."
    },
    {
      "judge_key": "qwen35_397b_a17b",
      "judge_name": "Qwen 3.5 397B-A17B",
      "respondent_key": "qwen35_35b_a3b",
      "respondent_name": "Qwen 3.5 35B-A3B",
      "weighted_score": 6.15,
      "brief_justification": "The implementation contains logic bugs (eviction threshold allows exceeding max size) and inconsistencies (missing eviction callback on TTL expiration). Additionally, the test suite is incomplete and the data model lacks explicit linked-list attributes."
    },
    {
      "judge_key": "qwen35_397b_a17b",
      "judge_name": "Qwen 3.5 397B-A17B",
      "respondent_key": "qwen35_122b_a10b",
      "respondent_name": "Qwen 3.5 122B-A10B",
      "weighted_score": 4.4,
      "brief_justification": "The implementation violates the O(1) requirement using O(N) list operations, contains a NameError due to a missing RLock import, and the test suite is abruptly truncated. While the code structure is clear, these critical failures prevent it from being functional or production-ready."
    }
  ],
  "meta": {
    "source": "The Multivac (app.themultivac.com)",
    "methodology": "10x10 blind peer matrix evaluation",
    "criteria": "correctness, completeness, clarity, depth, usefulness",
    "self_judgments": "excluded from rankings",
    "license": "Open data — cite as: The Multivac (2026)"
  }
}