Experimental browser for the Atmosphere
ITP implements the principle of pessimism in the face of uncertainty through a rejection sampling scheme, and can be viewed as a purely inference-time counterpart to our χPO work here: arxiv.org/abs/2407.13399. 7/11
May 3, 2025, 5:40 PM
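For intuition only, here is a minimal sketch of what "pessimism via rejection sampling" at inference time could look like. The function names, the ensemble-disagreement penalty, and the exponential acceptance rule are illustrative assumptions, not details taken from the ITP or χPO papers.

    import math
    import random

    def pessimistic_reward(y, reward_models, penalty=1.0):
        """Ensemble mean reward minus a disagreement penalty (assumed form of pessimism)."""
        scores = [rm(y) for rm in reward_models]
        mean = sum(scores) / len(scores)
        spread = max(scores) - min(scores)  # crude uncertainty proxy
        return mean - penalty * spread

    def sample_with_pessimism(prompt, ref_policy, reward_models,
                              beta=1.0, r_max=1.0, max_tries=128):
        """Rejection sampling against the reference policy, reweighted by a
        pessimistic reward; falls back to the last draw if nothing is accepted."""
        y = None
        for _ in range(max_tries):
            y = ref_policy(prompt)                      # draw a candidate response
            r = pessimistic_reward(y, reward_models)
            accept_prob = math.exp((r - r_max) / beta)  # <= 1 whenever r <= r_max
            if random.random() < min(1.0, accept_prob):
                return y
        return y

The sketch keeps the base model untouched and only filters its samples, which is the sense in which such a scheme is purely an inference-time counterpart to training-time regularization methods like χPO.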
{ "uri": "at://did:plc:x2a3inabvfsn4wntrlbbndrv/app.bsky.feed.post/3lobv4g2sks2d", "cid": "bafyreiguqwkqbexp5hkayqbacusyerepe7fzs5u7oy7qidc2mt37t5cscy", "value": { "text": "ITP implements the principle of pessimism in the face of uncertainty through a rejection sampling scheme, and can be viewed as a purely inference-time counterpart to our χPO work here: arxiv.org/abs/2407.13399.\n\n7/11", "$type": "app.bsky.feed.post", "embed": { "$type": "app.bsky.embed.external", "external": { "uri": "https://arxiv.org/abs/2407.13399", "thumb": { "$type": "blob", "ref": { "$link": "bafkreieihf6p54kf7b4suuiy5kov6yxwlvjtrvyvynqbgqzn3jfcqy2b6q" }, "mimeType": "image/jpeg", "size": 146959 }, "title": "Correcting the Mythos of KL-Regularization: Direct Alignment without Overoptimization via Chi-Squared Preference Optimization", "description": "Language model alignment methods such as reinforcement learning from human feedback (RLHF) have led to impressive advances in language model capabilities, but are limited by a widely observed phenomen..." } }, "langs": [ "en" ], "reply": { "root": { "cid": "bafyreih6mvnxmoz4bgad7vcbv2fl63qhwwggtht5lpckbzj7yexbnz2qie", "uri": "at://did:plc:x2a3inabvfsn4wntrlbbndrv/app.bsky.feed.post/3lobv4byuec2d" }, "parent": { "cid": "bafyreic4egovi7uuzdkg3bifql6kkbujfzq5wbun2ukxzkhkw3fki2bbau", "uri": "at://did:plc:x2a3inabvfsn4wntrlbbndrv/app.bsky.feed.post/3lobv4g2qmc2d" } }, "facets": [ { "index": { "byteEnd": 210, "byteStart": 186 }, "features": [ { "uri": "https://arxiv.org/abs/2407.13399", "$type": "app.bsky.richtext.facet#link" } ] } ], "createdAt": "2025-05-03T17:40:49.565Z" } }