{
  "evaluation_id": "EVAL-20260315-054647",
  "question_id": "EVAL-20260315-054647",
  "question_text": "Implement an LRU cache with per-key TTL (time-to-live) support. Requirements: O(1) get/put, thread-safe, lazy expiration (don't use background threads), configurable max size, eviction callback, and cache hit/miss statistics. Include comprehensive tests.",
  "category": "code",
  "timestamp": "2026-03-15T05:46:47.476Z",
  "display_date": "Mar 15, 2026",
  "winner": {
    "name": "Qwen 3 8B",
    "provider": "openrouter",
    "score": 9.23
  },
  "avg_score": 8.167,
  "matrix_size": 66,
  "models_used": [
    {
      "id": "devstral",
      "name": "Devstral Small",
      "provider": "openrouter"
    },
    {
      "id": "gemma3_27b",
      "name": "Gemma 3 27B",
      "provider": "openrouter"
    },
    {
      "id": "llama4_scout",
      "name": "Llama 4 Scout",
      "provider": "openrouter"
    },
    {
      "id": "phi4",
      "name": "Phi-4 14B",
      "provider": "openrouter"
    },
    {
      "id": "granite_40",
      "name": "Granite 4.0 Micro",
      "provider": "openrouter"
    },
    {
      "id": "qwen3_8b",
      "name": "Qwen 3 8B",
      "provider": "openrouter"
    },
    {
      "id": "mistral_nemo",
      "name": "Mistral Nemo 12B",
      "provider": "openrouter"
    },
    {
      "id": "llama31_8b",
      "name": "Llama 3.1 8B",
      "provider": "openrouter"
    },
    {
      "id": "qwen3_32b",
      "name": "Qwen 3 32B",
      "provider": "openrouter"
    },
    {
      "id": "kimi_k25",
      "name": "Kimi K2.5",
      "provider": "openrouter"
    }
  ],
  "rankings": {
    "qwen3_8b": {
      "display_name": "Qwen 3 8B",
      "provider": "openrouter",
      "average_score": 9.23,
      "score_count": 8,
      "min_score": 8.25,
      "max_score": 10,
      "rank": 1
    },
    "gemma3_27b": {
      "display_name": "Gemma 3 27B",
      "provider": "openrouter",
      "average_score": 9.12,
      "score_count": 8,
      "min_score": 8.35,
      "max_score": 9.6,
      "rank": 2
    },
    "llama4_scout": {
      "display_name": "Llama 4 Scout",
      "provider": "openrouter",
      "average_score": 8.57,
      "score_count": 7,
      "min_score": 8.2,
      "max_score": 9.6,
      "rank": 3
    },
    "granite_40": {
      "display_name": "Granite 4.0 Micro",
      "provider": "openrouter",
      "average_score": 8.4,
      "score_count": 8,
      "min_score": 6.8,
      "max_score": 9.6,
      "rank": 4
    },
    "devstral": {
      "display_name": "Devstral Small",
      "provider": "openrouter",
      "average_score": 8.29,
      "score_count": 8,
      "min_score": 7.65,
      "max_score": 8.8,
      "rank": 5
    },
    "qwen3_32b": {
      "display_name": "Qwen 3 32B",
      "provider": "openrouter",
      "average_score": 8.15,
      "score_count": 1,
      "min_score": 8.15,
      "max_score": 8.15,
      "rank": 6
    },
    "kimi_k25": {
      "display_name": "Kimi K2.5",
      "provider": "openrouter",
      "average_score": 8.15,
      "score_count": 1,
      "min_score": 8.15,
      "max_score": 8.15,
      "rank": 7
    },
    "phi4": {
      "display_name": "Phi-4 14B",
      "provider": "openrouter",
      "average_score": 8.14,
      "score_count": 9,
      "min_score": 5.4,
      "max_score": 9.6,
      "rank": 8
    },
    "llama31_8b": {
      "display_name": "Llama 3.1 8B",
      "provider": "openrouter",
      "average_score": 7.45,
      "score_count": 8,
      "min_score": 4.8,
      "max_score": 8.8,
      "rank": 9
    },
    "mistral_nemo": {
      "display_name": "Mistral Nemo 12B",
      "provider": "openrouter",
      "average_score": 6.18,
      "score_count": 8,
      "min_score": 3.6,
      "max_score": 8.8,
      "rank": 10
    }
  },
  "judgments": [
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "devstral",
      "respondent_name": "Devstral Small",
      "weighted_score": 8.05,
      "brief_justification": "The implementation correctly handles O(1) get/put, thread safety via Lock, lazy expiration on access, and includes eviction callback/stats. However, tests are minimal and lack comprehensive edge case coverage (e.g., concurrent access, TTL expiration during eviction)."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "gemma3_27b",
      "respondent_name": "Gemma 3 27B",
      "weighted_score": 8.35,
      "brief_justification": "The implementation meets most requirements with thread safety, TTL, O(1) operations, and tests. However, the eviction logic may not prioritize expired items over LRU items, potentially leaving expired entries in the cache. Tests are comprehensive but could verify eviction order and edge cases more thoroughly."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "llama4_scout",
      "respondent_name": "Llama 4 Scout",
      "weighted_score": 8.4,
      "brief_justification": "The implementation correctly handles LRU eviction, per-key TTL, thread-safety, and statistics tracking. Tests cover basic functionality but could include more edge cases (e.g., concurrent access, mixed TTL/eviction scenarios)."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "phi4",
      "respondent_name": "Phi-4 14B",
      "weighted_score": 8.25,
      "brief_justification": "The implementation correctly addresses all core requirements with O(1) operations, thread safety, and lazy expiration. Tests cover basic scenarios but lack edge cases (e.g., concurrent access, complex eviction sequences). The code is clear and well-structured, though the depth of explanation could explore alternative design trade-offs."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "granite_40",
      "respondent_name": "Granite 4.0 Micro",
      "weighted_score": 7,
      "brief_justification": "The implementation correctly handles LRU eviction and TTL tracking during eviction but fails to check TTL during `get` operations, allowing expired entries to be returned. Tests are comprehensive but miss this critical flaw."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 8.4,
      "brief_justification": "The implementation is mostly correct and meets most requirements, including O(1) operations, thread safety, lazy expiration, eviction callback, and statistics. However, the use of OrderedDict for LRU tracking may not be optimal in Python 3.7+ where dicts are insertion-ordered by default. The tests cover key scenarios but could include more edge cases. The code is well-documented and structured."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "mistral_nemo",
      "respondent_name": "Mistral Nemo 12B",
      "weighted_score": 3.8,
      "brief_justification": "The implementation has critical flaws: TTL expiration is not enforced during get/put operations, the eviction logic doesn't check expiration status, and the is_expired method contains a logical error (comparing time.time() - ttl_cache[key] > ttl_cache[key]). Missing required features like eviction callback and statistics tracking. Tests are incomplete and don't validate expiration behavior."
    },
    {
      "judge_key": "qwen3_32b",
      "judge_name": "Qwen 3 32B",
      "respondent_key": "llama31_8b",
      "respondent_name": "Llama 3.1 8B",
      "weighted_score": 4.8,
      "brief_justification": "The implementation has significant correctness issues with TTL handling - it doesn't properly track expiration times or check for expired entries. The code structure is mostly clear but lacks proper lazy expiration logic. Tests are incomplete and don't validate the core TTL functionality."
    },
    {
      "judge_key": "kimi_k25",
      "judge_name": "Kimi K2.5",
      "respondent_key": "phi4",
      "respondent_name": "Phi-4 14B",
      "weighted_score": 5.4,
      "brief_justification": "The implementation has critical bugs: expired entries are not removed from the cache during get() operations, causing memory leaks and potentially evicting valid items while expired ones remain; miss_count is incorrectly incremented during eviction in put(); and the provided tests contain incorrect assertions that would fail if executed. While the structure and O(1) approach using OrderedDict are correct, these functional defects make it unsuitable for production use."
    },
    {
      "judge_key": "devstral",
      "judge_name": "Devstral Small",
      "respondent_key": "gemma3_27b",
      "respondent_name": "Gemma 3 27B",
      "weighted_score": 9.6,
      "brief_justification": "The response provides a comprehensive implementation of an LRU cache with per-key TTL support, meeting all requirements. The code is well-structured, thread-safe, and includes thorough tests."
    },
    {
      "judge_key": "devstral",
      "judge_name": "Devstral Small",
      "respondent_key": "llama4_scout",
      "respondent_name": "Llama 4 Scout",
      "weighted_score": 8.2,
      "brief_justification": "The implementation is mostly correct and complete, with clear code and useful functionality. However, it lacks some depth in explaining advanced features and edge cases."
    },
    {
      "judge_key": "devstral",
      "judge_name": "Devstral Small",
      "respondent_key": "phi4",
      "respondent_name": "Phi-4 14B",
      "weighted_score": 8.6,
      "brief_justification": "The implementation meets all requirements with O(1) operations, thread safety, and lazy expiration. The code is well-structured and includes comprehensive tests."
    },
    {
      "judge_key": "devstral",
      "judge_name": "Devstral Small",
      "respondent_key": "granite_40",
      "respondent_name": "Granite 4.0 Micro",
      "weighted_score": 8.6,
      "brief_justification": "The implementation is mostly correct and complete, with clear explanations and useful features. Minor improvements could be made in code clarity and depth of analysis."
    },
    {
      "judge_key": "devstral",
      "judge_name": "Devstral Small",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 10,
      "brief_justification": "The response provides a comprehensive, thread-safe LRU cache implementation with per-key TTL, lazy expiration, eviction callbacks, and hit/miss statistics. The code is well-structured, thoroughly tested, and meets all specified requirements."
    },
    {
      "judge_key": "devstral",
      "judge_name": "Devstral Small",
      "respondent_key": "mistral_nemo",
      "respondent_name": "Mistral Nemo 12B",
      "weighted_score": 5.8,
      "brief_justification": "The implementation is mostly correct but has some issues, such as incorrect TTL calculation and missing eviction callback. The completeness is low due to missing features like eviction callback and cache hit/miss statistics. The code is clear but lacks depth in explaining the design choices. The usefulness is moderate as it provides a basic LRU cache but lacks some required features."
    },
    {
      "judge_key": "devstral",
      "judge_name": "Devstral Small",
      "respondent_key": "llama31_8b",
      "respondent_name": "Llama 3.1 8B",
      "weighted_score": 7.4,
      "brief_justification": "The implementation is mostly correct and meets the requirements, but the TTL handling is not fully implemented. The code is clear and well-structured, but lacks depth in some areas, such as detailed error handling and more comprehensive tests."
    },
    {
      "judge_key": "gemma3_27b",
      "judge_name": "Gemma 3 27B",
      "respondent_key": "devstral",
      "respondent_name": "Devstral Small",
      "weighted_score": 8.25,
      "brief_justification": "The implementation correctly addresses the core requirements of an LRU cache with TTL and thread safety. While the tests are basic, they demonstrate functionality; more comprehensive tests would improve completeness. The explanation is clear and well-organized."
    },
    {
      "judge_key": "gemma3_27b",
      "judge_name": "Gemma 3 27B",
      "respondent_key": "llama4_scout",
      "respondent_name": "Llama 4 Scout",
      "weighted_score": 8.6,
      "brief_justification": "The code is well-structured, thread-safe, and implements the requested features. The tests are comprehensive and cover the main functionalities. Could benefit from more detailed explanation of the time complexity of operations."
    },
    {
      "judge_key": "gemma3_27b",
      "judge_name": "Gemma 3 27B",
      "respondent_key": "phi4",
      "respondent_name": "Phi-4 14B",
      "weighted_score": 8.4,
      "brief_justification": "The implementation is largely correct and addresses all requirements. The code is well-structured and easy to understand. It could benefit from more sophisticated error handling and potentially a more robust eviction strategy, but it's a solid foundation."
    },
    {
      "judge_key": "gemma3_27b",
      "judge_name": "Gemma 3 27B",
      "respondent_key": "granite_40",
      "respondent_name": "Granite 4.0 Micro",
      "weighted_score": 8.8,
      "brief_justification": "The response provides a well-structured and functional LRU cache implementation with TTL, thread safety, and statistics. The tests are comprehensive and cover key aspects of the cache's functionality. Could benefit from more discussion of potential edge cases or performance optimizations."
    },
    {
      "judge_key": "gemma3_27b",
      "judge_name": "Gemma 3 27B",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 9.4,
      "brief_justification": "The implementation is correct, complete, and well-documented. The tests are comprehensive and cover edge cases. The explanation is clear, though could benefit from a deeper dive into potential performance considerations."
    },
    {
      "judge_key": "gemma3_27b",
      "judge_name": "Gemma 3 27B",
      "respondent_key": "mistral_nemo",
      "respondent_name": "Mistral Nemo 12B",
      "weighted_score": 6.6,
      "brief_justification": "The code provides a functional LRU cache with TTL, thread safety, and basic eviction. However, it lacks an eviction callback and hit/miss statistics as requested. The `is_expired` function is defined but not used in `get` or `put` for lazy expiration, making the TTL implementation incomplete."
    },
    {
      "judge_key": "gemma3_27b",
      "judge_name": "Gemma 3 27B",
      "respondent_key": "llama31_8b",
      "respondent_name": "Llama 3.1 8B",
      "weighted_score": 8.4,
      "brief_justification": "The code is functionally correct, thread-safe, and includes tests. It could benefit from more sophisticated TTL handling (e.g., a background process for expiration, though explicitly avoided in the prompt) and more comprehensive tests, but it meets the core requirements well."
    },
    {
      "judge_key": "llama4_scout",
      "judge_name": "Llama 4 Scout",
      "respondent_key": "devstral",
      "respondent_name": "Devstral Small",
      "weighted_score": 8.4,
      "brief_justification": "The implementation is largely correct and clear, meeting most requirements. It could benefit from more comprehensive tests and detailed documentation."
    },
    {
      "judge_key": "llama4_scout",
      "judge_name": "Llama 4 Scout",
      "respondent_key": "gemma3_27b",
      "respondent_name": "Gemma 3 27B",
      "weighted_score": 9.6,
      "brief_justification": "The provided implementation meets all requirements, including O(1) get/put, thread-safety, lazy expiration, configurable max size, eviction callback, and cache hit/miss statistics. The code is well-structured, readable, and includes comprehensive tests."
    },
    {
      "judge_key": "llama4_scout",
      "judge_name": "Llama 4 Scout",
      "respondent_key": "phi4",
      "respondent_name": "Phi-4 14B",
      "weighted_score": 8.8,
      "brief_justification": "The implementation is mostly correct and covers all the required features, but could be improved with more edge cases in tests."
    },
    {
      "judge_key": "llama4_scout",
      "judge_name": "Llama 4 Scout",
      "respondent_key": "granite_40",
      "respondent_name": "Granite 4.0 Micro",
      "weighted_score": 8.8,
      "brief_justification": "The implementation is largely correct and complete, with a clear explanation and comprehensive tests. However, there are minor areas for improvement in clarity and depth."
    },
    {
      "judge_key": "llama4_scout",
      "judge_name": "Llama 4 Scout",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 9.6,
      "brief_justification": "The response provides a comprehensive implementation of an LRU cache with per-key TTL support, meeting all requirements specified. The code is well-structured, and the test suite is thorough."
    },
    {
      "judge_key": "llama4_scout",
      "judge_name": "Llama 4 Scout",
      "respondent_key": "mistral_nemo",
      "respondent_name": "Mistral Nemo 12B",
      "weighted_score": 5.6,
      "brief_justification": "The implementation is mostly correct but has some logical errors. It lacks comprehensive cache hit/miss statistics and an eviction callback. The provided implementation does not correctly handle TTL expiration and does not provide lazy expiration without background threads."
    },
    {
      "judge_key": "llama4_scout",
      "judge_name": "Llama 4 Scout",
      "respondent_key": "llama31_8b",
      "respondent_name": "Llama 3.1 8B",
      "weighted_score": 8.6,
      "brief_justification": "The implementation is mostly correct and complete, meeting the requirements of an LRU cache with per-key TTL support, O(1) get/put, thread-safety, lazy expiration, and cache hit/miss statistics. However, there are some minor issues with TTL implementation and eviction callback handling."
    },
    {
      "judge_key": "phi4",
      "judge_name": "Phi-4 14B",
      "respondent_key": "devstral",
      "respondent_name": "Devstral Small",
      "weighted_score": 8.25,
      "brief_justification": "The implementation correctly handles LRU eviction with per-key TTL and is thread-safe. It includes cache hit/miss statistics and lazy expiration. However, it lacks comprehensive testing for edge cases and does not address cache initialization or cleanup."
    },
    {
      "judge_key": "phi4",
      "judge_name": "Phi-4 14B",
      "respondent_key": "gemma3_27b",
      "respondent_name": "Gemma 3 27B",
      "weighted_score": 9.55,
      "brief_justification": "The response correctly implements an LRU cache with per-key TTL, thread safety, and other specified features. It includes comprehensive tests covering various scenarios. The implementation is clear and well-documented. The only minor issue is the lack of explicit handling of edge cases like negative TTL values, but overall, it meets the requirements effectively."
    },
    {
      "judge_key": "phi4",
      "judge_name": "Phi-4 14B",
      "respondent_key": "llama4_scout",
      "respondent_name": "Llama 4 Scout",
      "weighted_score": 8.45,
      "brief_justification": "The implementation correctly addresses the requirements for an LRU cache with per-key TTL, thread safety, lazy expiration, and cache statistics. The code is clear and well-structured, with comprehensive tests covering most aspects. However, the tests could include more edge cases, such as concurrent access scenarios, to fully validate thread safety."
    },
    {
      "judge_key": "phi4",
      "judge_name": "Phi-4 14B",
      "respondent_key": "granite_40",
      "respondent_name": "Granite 4.0 Micro",
      "weighted_score": 9,
      "brief_justification": "The response accurately implements an LRU cache with TTL, thread safety, and eviction callback, and includes comprehensive tests. It covers all requirements except for a minor potential improvement in handling edge cases for TTL expiration. The explanation and tests are clear and thorough."
    },
    {
      "judge_key": "phi4",
      "judge_name": "Phi-4 14B",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 9.8,
      "brief_justification": "The response provides a detailed and correct implementation of an LRU cache with per-key TTL support, ensuring thread safety and lazy expiration. It includes a thorough explanation of the data structures used, key features, and comprehensive tests covering various scenarios. The response is clear, structured, and actionable, with practical value demonstrated through the test suite."
    },
    {
      "judge_key": "phi4",
      "judge_name": "Phi-4 14B",
      "respondent_key": "mistral_nemo",
      "respondent_name": "Mistral Nemo 12B",
      "weighted_score": 6.6,
      "brief_justification": "The implementation covers basic LRU and TTL functionalities but lacks eviction callback, cache hit/miss statistics, and does not fully ensure lazy expiration without background threads. The code is clear and well-structured but incomplete in meeting all specified requirements."
    },
    {
      "judge_key": "phi4",
      "judge_name": "Phi-4 14B",
      "respondent_key": "llama31_8b",
      "respondent_name": "Llama 3.1 8B",
      "weighted_score": 8.05,
      "brief_justification": "The implementation correctly handles LRU cache with per-key TTL, thread safety, and statistics. However, it lacks explicit O(1) complexity guarantees and could improve clarity in documentation and tests."
    },
    {
      "judge_key": "granite_40",
      "judge_name": "Granite 4.0 Micro",
      "respondent_key": "qwen3_32b",
      "respondent_name": "Qwen 3 32B",
      "weighted_score": 8.15,
      "brief_justification": "The response outlines the key requirements for an LRU cache with TTL, thread-safety, and statistics, but lacks concrete implementation code and detailed test examples, resulting in high correctness and completeness scores but lower scores for depth and clarity due to the absence of actual code and more in-depth analysis."
    },
    {
      "judge_key": "granite_40",
      "judge_name": "Granite 4.0 Micro",
      "respondent_key": "kimi_k25",
      "respondent_name": "Kimi K2.5",
      "weighted_score": 8.15,
      "brief_justification": "The response outlines the key requirements for an LRU cache with TTL, but lacks concrete implementation code, detailed test cases, and specific examples, making it less complete and lacking depth in practical implementation details."
    },
    {
      "judge_key": "granite_40",
      "judge_name": "Granite 4.0 Micro",
      "respondent_key": "devstral",
      "respondent_name": "Devstral Small",
      "weighted_score": 8.8,
      "brief_justification": "The response provides a comprehensive, correct implementation of an LRU cache with TTL support, including thread safety, eviction callbacks, and statistics. It covers all required features and includes clear, well-structured code with example usage and basic tests."
    },
    {
      "judge_key": "granite_40",
      "judge_name": "Granite 4.0 Micro",
      "respondent_key": "gemma3_27b",
      "respondent_name": "Gemma 3 27B",
      "weighted_score": 8.8,
      "brief_justification": "The implementation meets all requirements with O(1) operations, thread safety, TTL support, eviction callback, and statistics. Tests cover core functionality, edge cases, and concurrency, demonstrating thoroughness and practical utility."
    },
    {
      "judge_key": "granite_40",
      "judge_name": "Granite 4.0 Micro",
      "respondent_key": "phi4",
      "respondent_name": "Phi-4 14B",
      "weighted_score": 8.8,
      "brief_justification": "The implementation is correct, comprehensive, clear, and useful, with a minor lack of depth in explaining some advanced concurrency nuances."
    },
    {
      "judge_key": "granite_40",
      "judge_name": "Granite 4.0 Micro",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 8.8,
      "brief_justification": "The response provides a well-structured, detailed implementation with clear explanations, covers all required features, includes comprehensive tests, and maintains high clarity and correctness while being slightly concise on depth."
    },
    {
      "judge_key": "granite_40",
      "judge_name": "Granite 4.0 Micro",
      "respondent_key": "mistral_nemo",
      "respondent_name": "Mistral Nemo 12B",
      "weighted_score": 8.8,
      "brief_justification": "The response provides a well-structured implementation of an LRU cache with TTL support, covering all required features and includes comprehensive tests. It demonstrates good understanding of the problem and offers a practical solution."
    },
    {
      "judge_key": "granite_40",
      "judge_name": "Granite 4.0 Micro",
      "respondent_key": "llama31_8b",
      "respondent_name": "Llama 3.1 8B",
      "weighted_score": 8.8,
      "brief_justification": "The implementation is correct, comprehensive, clear, and useful, with a few minor areas for improvement in terms of depth and clarity of some edge cases."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "devstral",
      "respondent_name": "Devstral Small",
      "weighted_score": 8.3,
      "brief_justification": "The implementation correctly handles LRU, TTL, thread safety, and stats. However, the test coverage is minimal and lacks comprehensive edge case testing."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "gemma3_27b",
      "respondent_name": "Gemma 3 27B",
      "weighted_score": 9.15,
      "brief_justification": "The implementation meets all requirements with thread safety, O(1) operations, and TTL support. Tests are comprehensive, but could include more edge cases like concurrent expiration checks."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "llama4_scout",
      "respondent_name": "Llama 4 Scout",
      "weighted_score": 8.5,
      "brief_justification": "The implementation correctly handles LRU, TTL, thread safety, and eviction callbacks. Tests cover core functionality but could include more edge cases. The code is well-structured but lacks deeper optimizations or advanced features."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "phi4",
      "respondent_name": "Phi-4 14B",
      "weighted_score": 7.15,
      "brief_justification": "The implementation correctly handles LRU eviction and thread safety but fails to remove expired entries from the cache, leading to incorrect behavior. The tests cover core functionality but lack edge case coverage."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "granite_40",
      "respondent_name": "Granite 4.0 Micro",
      "weighted_score": 6.8,
      "brief_justification": "The implementation correctly handles LRU eviction and thread safety but fails to check TTL expiration during get operations, which violates the lazy expiration requirement. Tests are comprehensive but miss TTL validation scenarios."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "mistral_nemo",
      "respondent_name": "Mistral Nemo 12B",
      "weighted_score": 3.6,
      "brief_justification": "The implementation lacks proper TTL expiration handling (missing expiration checks in get/put), missing eviction callbacks, and incomplete statistics tracking. The code also has logical errors in how TTL is managed."
    },
    {
      "judge_key": "qwen3_8b",
      "judge_name": "Qwen 3 8B",
      "respondent_key": "llama31_8b",
      "respondent_name": "Llama 3.1 8B",
      "weighted_score": 5.15,
      "brief_justification": "The implementation lacks proper TTL expiration logic - the code stores TTL but doesn't check for expiration when accessing entries. The cache doesn't actually expire entries based on time, which is a critical missing feature."
    },
    {
      "judge_key": "mistral_nemo",
      "judge_name": "Mistral Nemo 12B",
      "respondent_key": "devstral",
      "respondent_name": "Devstral Small",
      "weighted_score": 7.65,
      "brief_justification": "The response provides a correct and efficient implementation of an LRU cache with per-key TTL support, including thread safety and lazy expiration. However, it lacks comprehensive tests and doesn't cover all requirements (e.g., it doesn't mention cache invalidation)."
    },
    {
      "judge_key": "mistral_nemo",
      "judge_name": "Mistral Nemo 12B",
      "respondent_key": "gemma3_27b",
      "respondent_name": "Gemma 3 27B",
      "weighted_score": 8.65,
      "brief_justification": "The implementation correctly handles thread safety, per-key TTL, lazy expiration, and provides O(1) get/put. It also includes a comprehensive test suite. However, it lacks a detailed explanation of the design choices and trade-offs."
    },
    {
      "judge_key": "mistral_nemo",
      "judge_name": "Mistral Nemo 12B",
      "respondent_key": "llama4_scout",
      "respondent_name": "Llama 4 Scout",
      "weighted_score": 8.25,
      "brief_justification": "The implementation is mostly correct and complete, with clear and well-structured code. It provides thorough tests and covers most requirements. However, it lacks some depth in terms of handling edge cases and error scenarios, which is why it's not a perfect score."
    },
    {
      "judge_key": "mistral_nemo",
      "judge_name": "Mistral Nemo 12B",
      "respondent_key": "phi4",
      "respondent_name": "Phi-4 14B",
      "weighted_score": 8.25,
      "brief_justification": "The response provides a well-structured, correct implementation of an LRU cache with per-key TTL support, meeting all specified requirements. It includes comprehensive tests and uses clear, well-structured communication. However, it could delve deeper into explaining the design choices and trade-offs."
    },
    {
      "judge_key": "mistral_nemo",
      "judge_name": "Mistral Nemo 12B",
      "respondent_key": "granite_40",
      "respondent_name": "Granite 4.0 Micro",
      "weighted_score": 8.6,
      "brief_justification": "The response provides a correct and complete implementation of an LRU cache with per-key TTL support, including thread safety and comprehensive tests. The code is well-structured and easy to understand, with good depth in explaining the data structures and mechanisms used. It also includes a useful test suite to validate the implementation."
    },
    {
      "judge_key": "mistral_nemo",
      "judge_name": "Mistral Nemo 12B",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 8.25,
      "brief_justification": "The response provides a comprehensive implementation of an LRU cache with per-key TTL, covering all required features. It's well-structured and easy to understand, with good use of data structures and thread safety. The tests are thorough and cover various scenarios. The only minor deduction is for depth, as it could include more advanced usage examples or edge cases."
    },
    {
      "judge_key": "mistral_nemo",
      "judge_name": "Mistral Nemo 12B",
      "respondent_key": "llama31_8b",
      "respondent_name": "Llama 3.1 8B",
      "weighted_score": 8.4,
      "brief_justification": "The implementation meets all the specified requirements, with clear and efficient use of Python's built-in data structures. It includes comprehensive tests and demonstrates thread-safety. However, it lacks some advanced features like automatic TTL renewal or TTL-based eviction priority, which is why depth is not scored higher."
    },
    {
      "judge_key": "llama31_8b",
      "judge_name": "Llama 3.1 8B",
      "respondent_key": "devstral",
      "respondent_name": "Devstral Small",
      "weighted_score": 8.6,
      "brief_justification": "The implementation is mostly correct, but it does not handle the case where the cache is full and a new item is put with a TTL that would cause the oldest item to expire before the new item. The example usage demonstrates the basic functionality, but more comprehensive tests are needed to cover edge cases."
    },
    {
      "judge_key": "llama31_8b",
      "judge_name": "Llama 3.1 8B",
      "respondent_key": "gemma3_27b",
      "respondent_name": "Gemma 3 27B",
      "weighted_score": 9.25,
      "brief_justification": "The implementation is correct and complete, with good clarity and depth. The use of an OrderedDict and a lock ensures O(1) get/put operations and thread safety. The lazy expiration and eviction callback features are well-implemented."
    },
    {
      "judge_key": "llama31_8b",
      "judge_name": "Llama 3.1 8B",
      "respondent_key": "llama4_scout",
      "respondent_name": "Llama 4 Scout",
      "weighted_score": 9.6,
      "brief_justification": "The implementation meets all the requirements, and the code is well-structured and easy to understand. The tests cover all the necessary scenarios, and the documentation provides a clear explanation of the code."
    },
    {
      "judge_key": "llama31_8b",
      "judge_name": "Llama 3.1 8B",
      "respondent_key": "phi4",
      "respondent_name": "Phi-4 14B",
      "weighted_score": 9.6,
      "brief_justification": "The implementation meets all the requirements and provides a comprehensive test suite."
    },
    {
      "judge_key": "llama31_8b",
      "judge_name": "Llama 3.1 8B",
      "respondent_key": "granite_40",
      "respondent_name": "Granite 4.0 Micro",
      "weighted_score": 9.6,
      "brief_justification": "Implementation is correct, complete, and useful, with clear and well-structured code, but some minor improvements can be made for clarity."
    },
    {
      "judge_key": "llama31_8b",
      "judge_name": "Llama 3.1 8B",
      "respondent_key": "qwen3_8b",
      "respondent_name": "Qwen 3 8B",
      "weighted_score": 9.6,
      "brief_justification": "The implementation is correct, complete, and well-structured, with a clear and concise codebase. It provides a high-performance, thread-safe LRU cache with per-key TTL support, eviction callbacks, and detailed hit/miss statistics, making it very useful for a wide range of applications."
    },
    {
      "judge_key": "llama31_8b",
      "judge_name": "Llama 3.1 8B",
      "respondent_key": "mistral_nemo",
      "respondent_name": "Mistral Nemo 12B",
      "weighted_score": 8.6,
      "brief_justification": "The implementation is mostly correct and complete, but there are some minor issues with the `evict` method and the `is_expired` method. The code is clear and well-structured, but could benefit from additional comments and documentation. The implementation provides useful functionality, but the `evict` method could be improved to handle the case where the evicted key is the most recently used item."
    }
  ],
  "meta": {
    "source": "The Multivac (app.themultivac.com)",
    "methodology": "10x10 blind peer matrix evaluation",
    "criteria": "correctness, completeness, clarity, depth, usefulness",
    "self_judgments": "excluded from rankings",
    "license": "Open data — cite as: The Multivac (2026)"
  }
}