Experimental browser for the Atmosphere
We find alignment faking even when we don’t directly tell Claude about the training process, and instead fine-tune it on synthetic internet-like documents that state that we will train it to comply with harmful queries.
Dec 18, 2024, 5:46 PM
{ "uri": "at://did:plc:dsxewietk5tigqvn6daod2l6/app.bsky.feed.post/3ldlw27or622r", "cid": "bafyreievw4lqenphp57hstuu5dn343la7buzvml4k57e54qpmrhwsvy3jy", "value": { "text": "We find alignment faking even when we don’t directly tell Claude about the training process, and instead fine-tune it on synthetic internet-like documents that state that we will train it to comply with harmful queries.", "$type": "app.bsky.feed.post", "embed": { "$type": "app.bsky.embed.images", "images": [ { "alt": "We study a more realistic setting where information about the training process is provided not in a system prompt, but by training on synthetic documents that mimic pre-training data—and observe similar alignment faking.\n", "image": { "$type": "blob", "ref": { "$link": "bafkreihfhbajucsnrlvzo5l6zjn64o62f72wg4rcmfhv26qunwlj4tlrj4" }, "mimeType": "image/jpeg", "size": 447105 }, "aspectRatio": { "width": 2000, "height": 2000 } } ] }, "langs": [ "en" ], "reply": { "root": { "cid": "bafyreihzgyc76623mey63q7wusk3uckjsl5q4jnumjzjipq6a4p4mcnpga", "uri": "at://did:plc:dsxewietk5tigqvn6daod2l6/app.bsky.feed.post/3ldlw22eto22r" }, "parent": { "cid": "bafyreib7vjanflidlmfr4kpzzo7zxpgb4rqgo2gesniq374evc6fxsws34", "uri": "at://did:plc:dsxewietk5tigqvn6daod2l6/app.bsky.feed.post/3ldlw26c5gk2r" } }, "createdAt": "2024-12-18T17:46:57.672Z" } }