ATProto Browser

ATProto Browser

Experimental browser for the Atmosphere

Post

If this interests you, you can read the full paper here: metr.org/AI_R_D_Evalu... Or blog post here: metr.org/blog/2024-11... Or check out github.com/METR/ai-rd-t... for the benchmark and transcripts.metr.org for transcripts

Nov 25, 2024, 7:42 PM

Record data

{
  "uri": "at://did:plc:dll3hepzq76nymel5c3yt6nk/app.bsky.feed.post/3lbsbruneic2b",
  "cid": "bafyreienv4okeuyjp37yr7jul3wackmlly7vdebuu47xswkw7yafdwftmm",
  "value": {
    "text": "If this interests you, you can read the full paper here: metr.org/AI_R_D_Evalu...\nOr blog post here: metr.org/blog/2024-11...\nOr check out github.com/METR/ai-rd-t... for the benchmark and transcripts.metr.org for transcripts",
    "$type": "app.bsky.feed.post",
    "embed": {
      "$type": "app.bsky.embed.external",
      "external": {
        "uri": "https://metr.org/blog/2024-11-22-evaluating-r-d-capabilities-of-llms/",
        "thumb": {
          "$type": "blob",
          "ref": {
            "$link": "bafkreicyfuk6jekbyzrqyebvsldmffeyuxspklsoypz3kyiz46i7mxs6ii"
          },
          "mimeType": "image/jpeg",
          "size": 132418
        },
        "title": "Evaluating frontier AI R&D capabilities of language model agents against human experts",
        "description": "We’re releasing RE-Bench, a new benchmark for measuring the performance of humans and frontier model agents on ML research engineering tasks. We also share data from 71 human expert attempts and resul..."
      }
    },
    "langs": [
      "en"
    ],
    "reply": {
      "root": {
        "cid": "bafyreiewghwpltsxrvzxb4pehqb2a4prnn5wee34pxbfkj3xmdrccvdyau",
        "uri": "at://did:plc:dll3hepzq76nymel5c3yt6nk/app.bsky.feed.post/3lbsbrpmg3s2b"
      },
      "parent": {
        "cid": "bafyreidb7wzzau3h7wephuq5xlsl2pucpck252mjxu2ww4ymv2bfyfsppm",
        "uri": "at://did:plc:dll3hepzq76nymel5c3yt6nk/app.bsky.feed.post/3lbsbrumtv22b"
      }
    },
    "facets": [
      {
        "index": {
          "byteEnd": 81,
          "byteStart": 57
        },
        "features": [
          {
            "uri": "https://metr.org/AI_R_D_Evaluation_Report.pdf",
            "$type": "app.bsky.richtext.facet#link"
          }
        ]
      },
      {
        "index": {
          "byteEnd": 125,
          "byteStart": 101
        },
        "features": [
          {
            "uri": "https://metr.org/blog/2024-11-22-evaluating-r-d-capabilities-of-llms/",
            "$type": "app.bsky.richtext.facet#link"
          }
        ]
      },
      {
        "index": {
          "byteEnd": 165,
          "byteStart": 139
        },
        "features": [
          {
            "uri": "https://github.com/METR/ai-rd-tasks/tree/main",
            "$type": "app.bsky.richtext.facet#link"
          }
        ]
      },
      {
        "index": {
          "byteEnd": 208,
          "byteStart": 188
        },
        "features": [
          {
            "uri": "https://transcripts.metr.org",
            "$type": "app.bsky.richtext.facet#link"
          }
        ]
      }
    ],
    "createdAt": "2024-11-25T19:42:38.045Z"
  }
}