ATProto Browser

ATProto Browser

Experimental browser for the Atmosphere

Post

Researchers report Chatbot Arena scores are flawed. An undisclosed policy allows preferred providers disproportionate private testing access, making the leaderboard an unreliable measure of model capability, not reflecting true performance. #MLSky

May 6, 2025, 2:51 PM

Record data

{
  "uri": "at://did:plc:lrzkd5exmxqrblbruvjieofj/app.bsky.feed.post/3loj4zjof7j2w",
  "cid": "bafyreih6f5ytg6otwkz3umkludor6xod5fxyz2udwomjrdj4ujsfm4gmdq",
  "value": {
    "text": "Researchers report Chatbot Arena scores are flawed. An undisclosed policy allows preferred providers disproportionate private testing access, making the leaderboard an unreliable measure of model capability, not reflecting true performance. \n#MLSky",
    "$type": "app.bsky.feed.post",
    "embed": {
      "$type": "app.bsky.embed.external",
      "external": {
        "uri": "https://arxiv.org/abs/2504.20879",
        "thumb": {
          "$type": "blob",
          "ref": {
            "$link": "bafkreiccpl47tvnq5nbhb3czfzkzsquuvbcctgrqyhzujkxjynazefahna"
          },
          "mimeType": "image/png",
          "size": 19545
        },
        "title": "The Leaderboard Illusion",
        "description": "Measuring progress is fundamental to the advancement of any scientific field. As benchmarks play an increasingly central role, they also grow more susceptible to distortion. Chatbot Arena has emerged…"
      }
    },
    "facets": [
      {
        "index": {
          "byteEnd": 248,
          "byteStart": 242
        },
        "features": [
          {
            "tag": "MLSky",
            "$type": "app.bsky.richtext.facet#tag"
          }
        ]
      }
    ],
    "createdAt": "2025-05-06T14:51:04.505Z"
  }
}