ATProto Browser

ATProto Browser

Experimental browser for the Atmosphere

Post

Alignment faking is currently easy to detect. But if future, more capable AIs were to fake alignment, it could be difficult to tell whether a model is truly safe—or just pretending to be.

For full details, read our paper: assets.anthropic.com/m/983c85a201...

Dec 18, 2024, 5:46 PM UTC

Record data

{
  "uri": "at://did:plc:dsxewietk5tigqvn6daod2l6/app.bsky.feed.post/3ldlw2dcr3c2r",
  "cid": "bafyreibj3oh5eak3rzr3i57mbk7gekljfxbxo6c2xfj3i5q6nxez3wcaaa",
  "value": {
    "text": "Alignment faking is currently easy to detect. But if future, more capable AIs were to fake alignment, it could be difficult to tell whether a model is truly safe—or just pretending to be.\n\nFor full details, read our paper: assets.anthropic.com/m/983c85a201...",
    "$type": "app.bsky.feed.post",
    "embed": {
      "$type": "app.bsky.embed.external",
      "external": {
        "uri": "https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf",
        "title": "",
        "description": ""
      }
    },
    "langs": [
      "en"
    ],
    "reply": {
      "root": {
        "cid": "bafyreihzgyc76623mey63q7wusk3uckjsl5q4jnumjzjipq6a4p4mcnpga",
        "uri": "at://did:plc:dsxewietk5tigqvn6daod2l6/app.bsky.feed.post/3ldlw22eto22r"
      },
      "parent": {
        "cid": "bafyreifpzshpf3nyn2pslafwoebkd3zika5duj2vbgqpbf4eqqy5dhim2y",
        "uri": "at://did:plc:dsxewietk5tigqvn6daod2l6/app.bsky.feed.post/3ldlw2btmc22r"
      }
    },
    "facets": [
      {
        "index": {
          "byteEnd": 261,
          "byteStart": 225
        },
        "features": [
          {
            "uri": "https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf",
            "$type": "app.bsky.richtext.facet#link"
          }
        ]
      }
    ],
    "createdAt": "2024-12-18T17:46:57.676Z"
  }
}