pg_dirtyread是PostgreSQL的一个扩展,提供了一种能力,可以读取数据库中已删除但未被vacuum的行。通过绕过MVCC的可见性规则,直接读取表中已被删除但尚未被物理清理的死元组。这些元组通常对正常SQL查询不可见,但因未被执行VACUUM操作而残留在物理存储中。
使用方法
安装扩展:
postgres=# create extension pg_dirtyread ;
CREATE EXTENSION
可通过函数pg_dirtyread读取已删除但未被vacuum的行:
postgres=# insert into t8 values (1,1);
INSERT 0 1
postgres=# select * from t8;
a | b
---+---
1 | 1
(1 row)
-- 删除数据
postgres=# delete from t8 where a = 1;
DELETE 1
-- 数据已被删除
postgres=# select * from t8;
a | b
---+---
(0 rows)
-- 通过pg_dirtyread读取已删除但未被vacuum的行
postgres=# select * from pg_dirtyread('t8') as t(a int, b int);
a | b
---+---
1 | 1
(1 row)
如果执行update操作,也是可以通过pg_dirtyread读取被更新前的行数据值:
postgres=# insert into t8 values (1,0);
INSERT 0 1
postgres=# select * from t8;
a | b
---+---
1 | 0
(1 row)
postgres=# update t8 set b = 1;
UPDATE 1
-- 更新数据后,再次查询,旧行已读取不到
postgres=# select * from t8;
a | b
---+---
1 | 1
(1 row)
-- 通过pg_dirtyread读取已删除但未被vacuum的行
postgres=# select * from pg_dirtyread('t8') as t(a int, b int);
a | b
---+---
1 | 0 -- 旧行
1 | 1 -- 新行
(2 rows)
如果执行vacuum操作,则已删除但未被vacuum的行将无法读取:
postgres=# vacuum t8;
VACUUM
postgres=# select * from pg_dirtyread('t8') as t(a int, b int);
a | b
---+---
1 | 1
(1 row)
对于删除列,只要表未被重写(比如执行vacuum full或者cluster),那么可以通过dropped_N访问第N个被删除的列(N从1开始计数)。PostgreSQL会清除原列的类型信息,因此即使用户在表别名中指定了正确的类型,也只能进行有限的合法性检查,比如类型长度、类型对齐、类型修饰符、传值方式。
postgres=# select * from t8;
a | b
---+---
1 | 0
2 | 0
(2 rows)
-- 删除列b
postgres=# alter table t8 drop column b;
ALTER TABLE
postgres=# select * from t8;
a
---
1
2
(2 rows)
-- 通过pg_dirtyread读取已删除的列
postgres=# select * from pg_dirtyread('t8') t8(a int,dropped_2 int);
a | dropped_2
---+-----------
1 | 0
2 | 0
(2 rows)
系统列,比如xmax和ctid可通过在pg_dirtyread的表别名中包含这些列来读取。一个特殊的dead列用于报告死行(通过HeapTupleIsSurelyDead判断)。dead列在恢复期间不可用,尤其在standby机上不可用。
postgres=# select * from pg_dirtyread('t7') t7(tableoid oid, ctid tid, xmin xid, xmax xid, cmin cid, cmax cid, dead boolean, a int,b int);
tableoid | ctid | xmin | xmax | cmin | cmax | dead | a | b
----------+-------+---------+---------+------+------+------+---+---
32405 | (0,1) | 4028531 | 0 | 0 | 0 | f | 1 | 0
32405 | (0,2) | 4028531 | 4028532 | 0 | 0 | t | 2 | 0 -- 被更新的行,死元组
32405 | (0,3) | 4028532 | 0 | 0 | 0 | f | 2 | 2
(3 rows)
rollback回滚前的数据也可以通过pg_dirtyread读取:
postgres=# truncate table t7;
TRUNCATE TABLE
postgres=# insert into t7 values(1,1),(2,2);
INSERT 0 2
postgres=# select * from t7;
a | b
---+---
1 | 1
2 | 2
(2 rows)
postgres=# begin;
BEGIN
postgres=*# update t7 set b = 100 where a = 2;
UPDATE 1
postgres=*# rollback;
ROLLBACK
-- 回滚后,可以查看到回滚前的数据
postgres=# select * from pg_dirtyread('t7') t7(tableoid oid, ctid tid, xmin xid, xmax xid, cmin cid, cmax cid, dead boolean, a int,b int);
tableoid | ctid | xmin | xmax | cmin | cmax | dead | a | b
----------+-------+---------+---------+------+------+------+---+-----
32405 | (0,1) | 4028536 | 0 | 0 | 0 | f | 1 | 1
32405 | (0,2) | 4028536 | 4028537 | 0 | 0 | f | 2 | 2
32405 | (0,3) | 4028537 | 0 | 0 | 0 | f | 2 | 100 -- 回滚前的数据
(3 rows)
实现原理
我们看一下其最关键的函数pg_dirtyread的实现原理。对要读取的表,进行顺序扫描,获取所有的元组,对元组的可见性判断中一直返回true,
Datum pg_dirtyread(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
pg_dirtyread_ctx *usr_ctx;
HeapTuple tuplein;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldcontext;
Oid relid;
TupleDesc tupdesc;
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to use pg_dirtyread")));
relid = PG_GETARG_OID(0);
if (!OidIsValid(relid))
elog(ERROR, "invalid relation oid \"%d\"", relid);
funcctx = SRF_FIRSTCALL_INIT();
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
usr_ctx = (pg_dirtyread_ctx *) palloc(sizeof(pg_dirtyread_ctx));
usr_ctx->rel = table_open(relid, AccessShareLock); // 打开表
usr_ctx->reltupdesc = RelationGetDescr(usr_ctx->rel); // 获取表描述符
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("function returning record called in context "
"that cannot accept type record")));
funcctx->tuple_desc = BlessTupleDesc(tupdesc); // 得到表别名完整的tuple描述符
usr_ctx->map = dirtyread_convert_tuples_by_name(usr_ctx->reltupdesc,
funcctx->tuple_desc, "Error converting tuple descriptors!");
// 构造TableScanDesc,准备扫描,使用SnapshotAny,读取所有版本的数据,包括已删除的行
usr_ctx->scan = heap_beginscan(usr_ctx->rel, SnapshotAny, 0, NULL, NULL, SO_TYPE_SEQSCAN);
usr_ctx->oldest_xmin = GlobalVisTestFor(usr_ctx->rel);
funcctx->user_fctx = (void *) usr_ctx;
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
usr_ctx = (pg_dirtyread_ctx *) funcctx->user_fctx;
// 不断获取下一行数据,对每一行禁止转换,直到扫描结束
if ((tuplein = heap_getnext(usr_ctx->scan, ForwardScanDirection)) != NULL)
{
if (usr_ctx->map != NULL)
{
tuplein = dirtyread_do_convert_tuple(tuplein, usr_ctx->map, usr_ctx->oldest_xmin);
SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuplein));
}
else
SRF_RETURN_NEXT(funcctx, heap_copy_tuple_as_datum(tuplein, usr_ctx->reltupdesc));
}
else
{
heap_endscan(usr_ctx->scan);
table_close(usr_ctx->rel, AccessShareLock);
SRF_RETURN_DONE(funcctx);
}
}
如果要将删除的数据(死元组)也读出来,则需要顺序扫描堆表(采用SNAPSHOT_ANY快照类型),对堆表中元组进行可见性判断,在进行可见性判断时,返回全部可见,即可读出旧元组。具体的会调用HeapTupleSatisfiesVisibility->HeapTupleSatisfiesAny,进行可见性判断。
bool HeapTupleSatisfiesVisibility(HeapTuple tup, Snapshot snapshot, Buffer buffer)
{
switch (snapshot->snapshot_type)
{
case SNAPSHOT_MVCC:
return HeapTupleSatisfiesMVCC(tup, snapshot, buffer);
break;
case SNAPSHOT_SELF:
return HeapTupleSatisfiesSelf(tup, snapshot, buffer);
break;
// 所有元组都可见
case SNAPSHOT_ANY:
return HeapTupleSatisfiesAny(tup, snapshot, buffer);
break;
case SNAPSHOT_TOAST:
return HeapTupleSatisfiesToast(tup, snapshot, buffer);
break;
case SNAPSHOT_DIRTY:
return HeapTupleSatisfiesDirty(tup, snapshot, buffer);
break;
case SNAPSHOT_HISTORIC_MVCC:
return HeapTupleSatisfiesHistoricMVCC(tup, snapshot, buffer);
break;
case SNAPSHOT_NON_VACUUMABLE:
return HeapTupleSatisfiesNonVacuumable(tup, snapshot, buffer);
break;
}
return false; /* keep compiler quiet */
}
/*
* HeapTupleSatisfiesAny
* Dummy "satisfies" routine: any tuple satisfies SnapshotAny.
*/
static bool HeapTupleSatisfiesAny(HeapTuple htup, Snapshot snapshot, Buffer buffer)
{
return true;
}
元组可见性判断调用栈:
HeapTupleSatisfiesAny(HeapTuple htup, Snapshot snapshot, Buffer buffer) heapam_visibility.c:341)
HeapTupleSatisfiesVisibility(HeapTuple tup, Snapshot snapshot, Buffer buffer) heapam_visibility.c:1777)
heapgettup(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key) heapam.c:720)
heap_getnext(TableScanDesc sscan, ScanDirection direction) heapam.c:1384)
pg_dirtyread.so!pg_dirtyread(FunctionCallInfo fcinfo) (pg_dirtyread.c:141)
能读取死元组的前提是没有被vacuum清理,或者没有被cluster,也就是原有的元组数据必须还存在在堆表页面中,如果已经被清理掉,那么无法进行读取了。
需要注意的是,pg_dirtyread是行级别的闪回查询,只能恢复update/delete/drop column/rollback这些MVCC机制的数据。
对于drop table等则无法进行闪回查询。
|