Experimental browser for the Atmosphere
If this interests you, you can read the full paper here: metr.org/AI_R_D_Evalu... Or blog post here: metr.org/blog/2024-11... Or check out github.com/METR/ai-rd-t... for the benchmark and transcripts.metr.org for transcripts
Nov 25, 2024, 7:42 PM
{ "uri": "at://did:plc:dll3hepzq76nymel5c3yt6nk/app.bsky.feed.post/3lbsbruneic2b", "cid": "bafyreienv4okeuyjp37yr7jul3wackmlly7vdebuu47xswkw7yafdwftmm", "value": { "text": "If this interests you, you can read the full paper here: metr.org/AI_R_D_Evalu...\nOr blog post here: metr.org/blog/2024-11...\nOr check out github.com/METR/ai-rd-t... for the benchmark and transcripts.metr.org for transcripts", "$type": "app.bsky.feed.post", "embed": { "$type": "app.bsky.embed.external", "external": { "uri": "https://metr.org/blog/2024-11-22-evaluating-r-d-capabilities-of-llms/", "thumb": { "$type": "blob", "ref": { "$link": "bafkreicyfuk6jekbyzrqyebvsldmffeyuxspklsoypz3kyiz46i7mxs6ii" }, "mimeType": "image/jpeg", "size": 132418 }, "title": "Evaluating frontier AI R&D capabilities of language model agents against human experts", "description": "We’re releasing RE-Bench, a new benchmark for measuring the performance of humans and frontier model agents on ML research engineering tasks. We also share data from 71 human expert attempts and resul..." } }, "langs": [ "en" ], "reply": { "root": { "cid": "bafyreiewghwpltsxrvzxb4pehqb2a4prnn5wee34pxbfkj3xmdrccvdyau", "uri": "at://did:plc:dll3hepzq76nymel5c3yt6nk/app.bsky.feed.post/3lbsbrpmg3s2b" }, "parent": { "cid": "bafyreidb7wzzau3h7wephuq5xlsl2pucpck252mjxu2ww4ymv2bfyfsppm", "uri": "at://did:plc:dll3hepzq76nymel5c3yt6nk/app.bsky.feed.post/3lbsbrumtv22b" } }, "facets": [ { "index": { "byteEnd": 81, "byteStart": 57 }, "features": [ { "uri": "https://metr.org/AI_R_D_Evaluation_Report.pdf", "$type": "app.bsky.richtext.facet#link" } ] }, { "index": { "byteEnd": 125, "byteStart": 101 }, "features": [ { "uri": "https://metr.org/blog/2024-11-22-evaluating-r-d-capabilities-of-llms/", "$type": "app.bsky.richtext.facet#link" } ] }, { "index": { "byteEnd": 165, "byteStart": 139 }, "features": [ { "uri": "https://github.com/METR/ai-rd-tasks/tree/main", "$type": "app.bsky.richtext.facet#link" } ] }, { "index": { "byteEnd": 208, "byteStart": 188 }, "features": [ { "uri": "https://transcripts.metr.org", "$type": "app.bsky.richtext.facet#link" } ] } ], "createdAt": "2024-11-25T19:42:38.045Z" } }