PostgreSQL数据库扩展——pg_dirtyread

mahan · 发表于 2025-9-21 19:45:48

pg_dirtyread是PostgreSQL的一个扩展，提供了一种能力，可以读取数据库中已删除但未被vacuum的行。通过绕过MVCC的可见性规则，直接读取表中已被删除但尚未被物理清理的死元组。这些元组通常对正常SQL查询不可见，但因未被执行VACUUM操作而残留在物理存储中。

使用方法
安装扩展：

postgres=# create extension pg_dirtyread ;
CREATE EXTENSION
可通过函数pg_dirtyread读取已删除但未被vacuum的行：

postgres=# insert into t8 values (1,1);
INSERT 0 1
postgres=# select * from t8;
a | b
---+---
1 | 1
(1 row)
-- 删除数据
postgres=# delete from t8 where a = 1;
DELETE 1
-- 数据已被删除
postgres=# select * from t8;
a | b
---+---
(0 rows)
-- 通过pg_dirtyread读取已删除但未被vacuum的行
postgres=# select * from pg_dirtyread('t8') as t(a int, b int);
a | b
---+---
1 | 1
(1 row)
如果执行update操作，也是可以通过pg_dirtyread读取被更新前的行数据值：

postgres=# insert into t8 values (1,0);
INSERT 0 1
postgres=# select * from t8;
a | b
---+---
1 | 0
(1 row)

postgres=# update t8 set b = 1;
UPDATE 1
-- 更新数据后，再次查询，旧行已读取不到
postgres=# select * from t8;
a | b
---+---
1 | 1
(1 row)
-- 通过pg_dirtyread读取已删除但未被vacuum的行
postgres=# select * from pg_dirtyread('t8') as t(a int, b int);
a | b
---+---
1 | 0    -- 旧行
1 | 1    -- 新行
(2 rows)
如果执行vacuum操作，则已删除但未被vacuum的行将无法读取：

postgres=# vacuum t8;
VACUUM
postgres=# select * from pg_dirtyread('t8') as t(a int, b int);
a | b
---+---
1 | 1
(1 row)
对于删除列，只要表未被重写（比如执行vacuum full或者cluster），那么可以通过dropped_N访问第N个被删除的列（N从1开始计数）。PostgreSQL会清除原列的类型信息，因此即使用户在表别名中指定了正确的类型，也只能进行有限的合法性检查，比如类型长度、类型对齐、类型修饰符、传值方式。

postgres=# select * from t8;
a | b
---+---
1 | 0
2 | 0
(2 rows)
-- 删除列b
postgres=# alter table t8 drop column b;
ALTER TABLE
postgres=# select * from t8;
a
---
1
2
(2 rows)
-- 通过pg_dirtyread读取已删除的列
postgres=# select * from pg_dirtyread('t8') t8(a int,dropped_2 int);
a | dropped_2
---+-----------
1 |       0
2 |       0
(2 rows)
系统列，比如xmax和ctid可通过在pg_dirtyread的表别名中包含这些列来读取。一个特殊的dead列用于报告死行（通过HeapTupleIsSurelyDead判断）。dead列在恢复期间不可用，尤其在standby机上不可用。

postgres=# select * from pg_dirtyread('t7') t7(tableoid oid, ctid tid, xmin xid, xmax xid, cmin cid, cmax cid, dead boolean, a int,b int);
tableoid | ctid  |  xmin |  xmax | cmin | cmax | dead | a | b
----------+-------+---------+---------+------+------+------+---+---
32405 | (0,1) | 4028531 |    0 | 0 | 0 | f | 1 | 0
32405 | (0,2) | 4028531 | 4028532 | 0 | 0 | t | 2 | 0    -- 被更新的行，死元组
32405 | (0,3) | 4028532 |    0 | 0 | 0 | f | 2 | 2
(3 rows)
rollback回滚前的数据也可以通过pg_dirtyread读取：

postgres=# truncate table t7;
TRUNCATE TABLE
postgres=# insert into t7 values(1,1),(2,2);
INSERT 0 2
postgres=# select * from t7;
a | b
---+---
1 | 1
2 | 2
(2 rows)

postgres=# begin;
BEGIN
postgres=*# update t7 set b = 100 where a = 2;
UPDATE 1
postgres=*# rollback;
ROLLBACK
-- 回滚后，可以查看到回滚前的数据
postgres=# select * from pg_dirtyread('t7') t7(tableoid oid, ctid tid, xmin xid, xmax xid, cmin cid, cmax cid, dead boolean, a int,b int);
tableoid | ctid  |  xmin |  xmax | cmin | cmax | dead | a |  b
----------+-------+---------+---------+------+------+------+---+-----
32405 | (0,1) | 4028536 |    0 | 0 | 0 | f | 1 | 1
32405 | (0,2) | 4028536 | 4028537 | 0 | 0 | f | 2 | 2
32405 | (0,3) | 4028537 |    0 | 0 | 0 | f | 2 | 100 -- 回滚前的数据
(3 rows)

实现原理
我们看一下其最关键的函数pg_dirtyread的实现原理。对要读取的表，进行顺序扫描，获取所有的元组，对元组的可见性判断中一直返回true，

Datum pg_dirtyread(PG_FUNCTION_ARGS)
{
FuncCallContext    *funcctx;
pg_dirtyread_ctx *usr_ctx;
HeapTuple          tuplein;

if (SRF_IS_FIRSTCALL())
{
      MemoryContext    oldcontext;
      Oid                relid;
      TupleDesc          tupdesc;

      if (!superuser())
         ereport(ERROR,
                  (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                  errmsg("must be superuser to use pg_dirtyread")));

      relid = PG_GETARG_OID(0);
      if (!OidIsValid(relid))
         elog(ERROR, "invalid relation oid \"%d\"", relid);

      funcctx = SRF_FIRSTCALL_INIT();
      oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
      usr_ctx = (pg_dirtyread_ctx *) palloc(sizeof(pg_dirtyread_ctx));
      usr_ctx->rel = table_open(relid, AccessShareLock);  // 打开表
      usr_ctx->reltupdesc = RelationGetDescr(usr_ctx->rel); // 获取表描述符
      if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
         ereport(ERROR,
                  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                  errmsg("function returning record called in context "
                     "that cannot accept type record")));
      funcctx->tuple_desc = BlessTupleDesc(tupdesc); // 得到表别名完整的tuple描述符
      usr_ctx->map = dirtyread_convert_tuples_by_name(usr_ctx->reltupdesc,
            funcctx->tuple_desc, "Error converting tuple descriptors!");
      // 构造TableScanDesc，准备扫描，使用SnapshotAny，读取所有版本的数据，包括已删除的行
      usr_ctx->scan = heap_beginscan(usr_ctx->rel, SnapshotAny, 0, NULL, NULL, SO_TYPE_SEQSCAN);
      usr_ctx->oldest_xmin = GlobalVisTestFor(usr_ctx->rel);
      funcctx->user_fctx = (void *) usr_ctx;
      MemoryContextSwitchTo(oldcontext);
}

funcctx = SRF_PERCALL_SETUP();
usr_ctx = (pg_dirtyread_ctx *) funcctx->user_fctx;
// 不断获取下一行数据，对每一行禁止转换，直到扫描结束
if ((tuplein = heap_getnext(usr_ctx->scan, ForwardScanDirection)) != NULL)
{
      if (usr_ctx->map != NULL)
      {
         tuplein = dirtyread_do_convert_tuple(tuplein, usr_ctx->map, usr_ctx->oldest_xmin);
         SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuplein));
      }
      else
         SRF_RETURN_NEXT(funcctx, heap_copy_tuple_as_datum(tuplein, usr_ctx->reltupdesc));
}
else
{
      heap_endscan(usr_ctx->scan);
      table_close(usr_ctx->rel, AccessShareLock);
      SRF_RETURN_DONE(funcctx);
}
}
如果要将删除的数据（死元组）也读出来，则需要顺序扫描堆表（采用SNAPSHOT_ANY快照类型），对堆表中元组进行可见性判断，在进行可见性判断时，返回全部可见，即可读出旧元组。具体的会调用HeapTupleSatisfiesVisibility->HeapTupleSatisfiesAny，进行可见性判断。

bool HeapTupleSatisfiesVisibility(HeapTuple tup, Snapshot snapshot, Buffer buffer)
{
switch (snapshot->snapshot_type)
{
case SNAPSHOT_MVCC:
return HeapTupleSatisfiesMVCC(tup, snapshot, buffer);
break;
case SNAPSHOT_SELF:
return HeapTupleSatisfiesSelf(tup, snapshot, buffer);
break;
      // 所有元组都可见
case SNAPSHOT_ANY:
return HeapTupleSatisfiesAny(tup, snapshot, buffer);
break;
case SNAPSHOT_TOAST:
return HeapTupleSatisfiesToast(tup, snapshot, buffer);
break;
case SNAPSHOT_DIRTY:
return HeapTupleSatisfiesDirty(tup, snapshot, buffer);
break;
case SNAPSHOT_HISTORIC_MVCC:
return HeapTupleSatisfiesHistoricMVCC(tup, snapshot, buffer);
break;
case SNAPSHOT_NON_VACUUMABLE:
return HeapTupleSatisfiesNonVacuumable(tup, snapshot, buffer);
break;
}

return false; /* keep compiler quiet */
}

/*
* HeapTupleSatisfiesAny
* Dummy "satisfies" routine: any tuple satisfies SnapshotAny.
*/
static bool HeapTupleSatisfiesAny(HeapTuple htup, Snapshot snapshot, Buffer buffer)
{
return true;
}
元组可见性判断调用栈：

HeapTupleSatisfiesAny(HeapTuple htup, Snapshot snapshot, Buffer buffer) heapam_visibility.c:341)
HeapTupleSatisfiesVisibility(HeapTuple tup, Snapshot snapshot, Buffer buffer) heapam_visibility.c:1777)
heapgettup(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key) heapam.c:720)
heap_getnext(TableScanDesc sscan, ScanDirection direction) heapam.c:1384)
pg_dirtyread.so!pg_dirtyread(FunctionCallInfo fcinfo) (pg_dirtyread.c:141)
能读取死元组的前提是没有被vacuum清理，或者没有被cluster，也就是原有的元组数据必须还存在在堆表页面中，如果已经被清理掉，那么无法进行读取了。

需要注意的是，pg_dirtyread是行级别的闪回查询，只能恢复update/delete/drop column/rollback这些MVCC机制的数据。
对于drop table等则无法进行闪回查询。

帐号		自动登录	找回密码
密码			注册

PostgreSQL数据库扩展——pg_dirtyread

浏览过的版块