Experimental browser for the Atmosphere
Alignment faking is currently easy to detect. But if future, more capable AIs were to fake alignment, it could be difficult to tell whether a model is truly safe—or just pretending to be. For full details, read our paper: https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf
Dec 18, 2024, 5:46 PM
{ "uri": "at://did:plc:dsxewietk5tigqvn6daod2l6/app.bsky.feed.post/3ldlw2dcr3c2r", "cid": "bafyreibj3oh5eak3rzr3i57mbk7gekljfxbxo6c2xfj3i5q6nxez3wcaaa", "value": { "text": "Alignment faking is currently easy to detect. But if future, more capable AIs were to fake alignment, it could be difficult to tell whether a model is truly safe—or just pretending to be.\n\nFor full details, read our paper: assets.anthropic.com/m/983c85a201...", "$type": "app.bsky.feed.post", "embed": { "$type": "app.bsky.embed.external", "external": { "uri": "https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf", "title": "", "description": "" } }, "langs": [ "en" ], "reply": { "root": { "cid": "bafyreihzgyc76623mey63q7wusk3uckjsl5q4jnumjzjipq6a4p4mcnpga", "uri": "at://did:plc:dsxewietk5tigqvn6daod2l6/app.bsky.feed.post/3ldlw22eto22r" }, "parent": { "cid": "bafyreifpzshpf3nyn2pslafwoebkd3zika5duj2vbgqpbf4eqqy5dhim2y", "uri": "at://did:plc:dsxewietk5tigqvn6daod2l6/app.bsky.feed.post/3ldlw2btmc22r" } }, "facets": [ { "index": { "byteEnd": 261, "byteStart": 225 }, "features": [ { "uri": "https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf", "$type": "app.bsky.richtext.facet#link" } ] } ], "createdAt": "2024-12-18T17:46:57.676Z" } }