greenplumn smgr 源码
greenplumn smgr 代码
文件路径:/src/backend/storage/smgr/smgr.c
/*-------------------------------------------------------------------------
*
* smgr.c
* public interface routines to storage manager switch.
*
* All file system operations in POSTGRES dispatch through these
* routines.
*
* Portions Copyright (c) 2006-2008, Greenplum inc
* Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates.
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/storage/smgr/smgr.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/aomd.h"
#include "access/xact.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/indexing.h"
#include "commands/tablespace.h"
#include "postmaster/postmaster.h"
#include "lib/ilist.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/smgr.h"
#include "utils/faultinjector.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
/*
 * Hooks that let plugins collect statistics from storage functions.
 * For example, the disk quota extension uses these hooks to detect
 * active tables.
 *
 * Every hook starts out NULL (not installed).  Within this file they are
 * invoked, when set, after the corresponding operation has been carried
 * out: file_create_hook by smgrcreate() and smgrcreate_ao(),
 * file_extend_hook by smgrextend(), file_truncate_hook by smgrtruncate(),
 * and file_unlink_hook by smgrdounlinkall().
 */
file_create_hook_type file_create_hook = NULL;
file_extend_hook_type file_extend_hook = NULL;
file_truncate_hook_type file_truncate_hook = NULL;
file_unlink_hook_type file_unlink_hook = NULL;
/*
 * Built-in storage manager switch table.  smgr_standard() indexes this
 * array with an SMgrImpl value, so the order of the entries is significant.
 */
static const f_smgr smgrsw[] = {
/* Entry 0: magnetic disk */
{
.smgr_init = mdinit,
.smgr_shutdown = NULL,
.smgr_close = mdclose,
.smgr_create = mdcreate,
.smgr_exists = mdexists,
.smgr_unlink = mdunlink,
.smgr_extend = mdextend,
.smgr_prefetch = mdprefetch,
.smgr_read = mdread,
.smgr_write = mdwrite,
.smgr_writeback = mdwriteback,
.smgr_nblocks = mdnblocks,
.smgr_truncate = mdtruncate,
.smgr_immedsync = mdimmedsync,
},
/*
 * Entry 1: relation files that are different from heap, characterised by:
 * 1. variable blocksize
 * 2. block numbers are not consecutive
 * 3. shared buffers are not used
 * Append-optimized relation files currently fall in this category.
 *
 * The only callback that differs from the magnetic disk entry is
 * smgr_unlink, which is mdunlink_ao here.
 */
{
.smgr_init = mdinit,
.smgr_shutdown = NULL,
.smgr_close = mdclose,
.smgr_create = mdcreate,
.smgr_exists = mdexists,
.smgr_unlink = mdunlink_ao,
.smgr_extend = mdextend,
.smgr_prefetch = mdprefetch,
.smgr_read = mdread,
.smgr_write = mdwrite,
.smgr_writeback = mdwriteback,
.smgr_nblocks = mdnblocks,
.smgr_truncate = mdtruncate,
.smgr_immedsync = mdimmedsync,
}
};
/* Number of entries in smgrsw[]. */
static const int NSmgr = lengthof(smgrsw);
/*
 * Each backend has a hashtable that stores all extant SMgrRelation objects.
 * In addition, "unowned" SMgrRelation objects are chained together in a list.
 *
 * SMgrRelationHash stays NULL until the first smgropen() call, which creates
 * the table and initializes unowned_relns at the same time.
 */
static HTAB *SMgrRelationHash = NULL;
static dlist_head unowned_relns;
/* local function prototypes */
static void smgrshutdown(int code, Datum arg);
/*
* smgrinit(), smgrshutdown() -- Initialize or shut down storage
* managers.
*
* Note: smgrinit is called during backend startup (normal or standalone
* case), *not* during postmaster start. Therefore, any resources created
* here or destroyed in smgrshutdown are backend-local.
*/
void
smgr_init_standard(void)
{
int i;
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_init)
smgrsw[i].smgr_init();
}
mdinit();
}
void
smgrinit(void)
{
if (smgr_init_hook)
(*smgr_init_hook)();
smgr_init_standard();
/* register the shutdown proc */
on_proc_exit(smgrshutdown, 0);
}
/*
 * smgr_shutdown_standard() -- Run the shutdown callback of every built-in
 *		storage manager listed in smgrsw[].
 *
 * Called from smgrshutdown(), the on_proc_exit callback registered by
 * smgrinit().  Entries whose smgr_shutdown is NULL are skipped.
 *
 * Takes no arguments; the parameter list is spelled "(void)" so that the
 * definition is a real prototype rather than an unspecified-argument
 * declaration.
 */
void
smgr_shutdown_standard(void)
{
	int			i;

	for (i = 0; i < NSmgr; i++)
	{
		if (smgrsw[i].smgr_shutdown)
			smgrsw[i].smgr_shutdown();
	}
}
/*
 * smgrshutdown() -- on_proc_exit callback for smgr cleanup.
 *
 * Neither argument is used.  An installed smgr_shutdown_hook runs first,
 * followed by the standard shutdown routine.
 */
static void
smgrshutdown(int code, Datum arg)
{
	if (smgr_shutdown_hook != NULL)
		smgr_shutdown_hook();

	smgr_shutdown_standard();
}
/*
 * Hooks for plugins to get control in smgr.  All start out NULL.
 *
 * smgr_hook: when set, smgr() calls it instead of smgr_standard() to pick
 * the f_smgr for a relation.
 * smgr_init_hook: when set, smgrinit() calls it before the standard
 * initialization.
 * smgr_shutdown_hook: when set, smgrshutdown() calls it before the standard
 * shutdown.
 */
smgr_hook_type smgr_hook = NULL;
smgr_init_hook_type smgr_init_hook = NULL;
smgr_shutdown_hook_type smgr_shutdown_hook = NULL;
/*
 * smgr_standard() -- Return the built-in storage manager for 'which'.
 *
 * 'which' is used directly as an index into smgrsw[]; 'backend' and
 * 'rnode' are not consulted here.
 */
const f_smgr *
smgr_standard(BackendId backend, RelFileNode rnode, SMgrImpl which)
{
	/* for md.c */
	const f_smgr *impl = smgrsw + which;

	return impl;
}
/*
 * smgr() -- Choose the storage manager implementation for a relation.
 *
 * If a plugin has installed smgr_hook, its answer is returned; otherwise
 * the built-in table is consulted through smgr_standard().
 */
const f_smgr *
smgr(BackendId backend, RelFileNode rnode, SMgrImpl which)
{
	if (smgr_hook != NULL)
		return smgr_hook(backend, rnode, which);

	return smgr_standard(backend, rnode, which);
}
/*
 * smgropen() -- Return an SMgrRelation object, creating it if need be.
 *
 * The underlying file is not opened here.  The per-backend hash table is
 * created on first use.  A newly created entry is initialized, bound to
 * the storage manager chosen by smgr(), and appended to the list of
 * unowned relations.
 */
SMgrRelation
smgropen(RelFileNode rnode, BackendId backend, SMgrImpl which)
{
	RelFileNodeBackend key;
	SMgrRelation entry;
	bool		existed;
	int			fork;

	/* GPDB: don't support MyBackendId as a possible backend. */
	Assert(backend == InvalidBackendId || backend == TempRelBackendId);

	/* First time through: set up the hash table and the unowned list */
	if (SMgrRelationHash == NULL)
	{
		HASHCTL		info;

		MemSet(&info, 0, sizeof(info));
		info.keysize = sizeof(RelFileNodeBackend);
		info.entrysize = sizeof(SMgrRelationData);
		SMgrRelationHash = hash_create("smgr relation table", 400,
									   &info, HASH_ELEM | HASH_BLOBS);
		dlist_init(&unowned_relns);
	}

	/* Find the entry for this (node, backend) pair, or make a new one */
	key.node = rnode;
	key.backend = backend;
	entry = (SMgrRelation) hash_search(SMgrRelationHash,
									   (void *) &key,
									   HASH_ENTER, &existed);

	/* An existing entry is returned untouched */
	if (existed)
		return entry;

	/* New entry: hash_search already filled in the lookup key */
	entry->smgr_owner = NULL;
	entry->smgr_targblock = InvalidBlockNumber;
	entry->smgr_fsm_nblocks = InvalidBlockNumber;
	entry->smgr_vm_nblocks = InvalidBlockNumber;
	entry->smgr_which = which;
	entry->storageManager = smgr(backend, rnode, which);

	/* no fork has any open segment yet */
	for (fork = 0; fork <= MAX_FORKNUM; fork++)
		entry->md_num_open_segs[fork] = 0;

	/* nobody owns it yet, so it goes onto the unowned list */
	dlist_push_tail(&unowned_relns, &entry->node);

	return entry;
}
/*
 * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
 *
 * Only one owner can exist at a time; that suffices because the only such
 * owners currently live in the relcache.  "Disowning" is not supported
 * here -- use smgrclearowner() for that.
 */
void
smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
{
	Assert(owner != NULL);

	/*
	 * A reln without an owner sits on the unowned list and must be taken off
	 * it.  Otherwise unhook the previous owner.  (Normally there shouldn't
	 * be one, but it seems possible during swap_relation_files(), depending
	 * on the order of processing; closing the old relcache entry early is
	 * fine in that case.)
	 */
	if (reln->smgr_owner == NULL)
		dlist_delete(&reln->node);
	else
		*(reln->smgr_owner) = NULL;

	/* Link the two sides to each other */
	reln->smgr_owner = owner;
	*owner = reln;
}
/*
 * smgrclearowner() -- Remove long-lived reference to an SMgrRelation object
 *		if one exists
 *
 * Nothing happens unless 'owner' is the current owner of 'reln'.
 */
void
smgrclearowner(SMgrRelation *owner, SMgrRelation reln)
{
	if (reln->smgr_owner == owner)
	{
		/* break the link in both directions */
		*owner = NULL;
		reln->smgr_owner = NULL;

		/* the reln is unowned again, so put it back on that list */
		dlist_push_tail(&unowned_relns, &reln->node);
	}
}
/*
 * smgrexists() -- Does the underlying file for a fork exist?
 *
 * Delegates to the relation's storage manager.
 */
bool
smgrexists(SMgrRelation reln, ForkNumber forknum)
{
	return reln->storageManager->smgr_exists(reln, forknum);
}
/*
 * smgrclose() -- Close and delete an SMgrRelation object.
 *
 * Every fork is closed through the storage manager, then the object is
 * removed from the unowned list (if it has no owner) and from the hash
 * table.
 */
void
smgrclose(SMgrRelation reln)
{
	SMgrRelation *owner_ref;
	ForkNumber	fork;
	void	   *removed;

	for (fork = 0; fork <= MAX_FORKNUM; fork++)
		reln->storageManager->smgr_close(reln, fork);

	/* Unowned relations are on the unowned list; take this one off */
	owner_ref = reln->smgr_owner;
	if (owner_ref == NULL)
		dlist_delete(&reln->node);

	removed = hash_search(SMgrRelationHash,
						  (void *) &(reln->smgr_rnode),
						  HASH_REMOVE, NULL);
	if (removed == NULL)
		elog(ERROR, "SMgrRelation hashtable corrupted");

	/*
	 * Clear the owner's pointer last: in the remote possibility that the
	 * removal above fails, the SMgrRelation object still exists.
	 */
	if (owner_ref != NULL)
		*owner_ref = NULL;
}
/*
* smgrcloseall() -- Close all existing SMgrRelation objects.
*/
void
smgrcloseall(void)
{
HASH_SEQ_STATUS status;
SMgrRelation reln;
/* Nothing to do if hashtable not set up */
if (SMgrRelationHash == NULL)
return;
hash_seq_init(&status, SMgrRelationHash);
while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
smgrclose(reln);
}
/*
 * smgrclosenode() -- Close SMgrRelation object for given RelFileNode,
 *		if one exists.
 *
 * The effect matches smgrclose(smgropen(rnode)), except that no hash
 * table entry is created just to be dropped again when none exists.
 */
void
smgrclosenode(RelFileNodeBackend rnode)
{
	SMgrRelation existing;

	/* Only look when the hash table has been set up */
	if (SMgrRelationHash != NULL)
	{
		existing = (SMgrRelation) hash_search(SMgrRelationHash,
											  (void *) &rnode,
											  HASH_FIND, NULL);
		if (existing != NULL)
			smgrclose(existing);
	}
}
/*
 * smgrcreate() -- Create a new relation.
 *
 * Given an already-created (but presumably unused) SMgrRelation, cause
 * the underlying disk file or other storage for the fork to be created.
 *
 * With isRedo set, an already existing file is acceptable because we
 * are in a WAL replay sequence.
 *
 * file_create_hook, if installed, is called once the storage manager
 * has created the fork.
 */
void
smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
{
	/*
	 * During WAL replay, a fork that we already have open surely exists, so
	 * there is nothing left to do.
	 */
	if (isRedo && reln->md_num_open_segs[forknum] > 0)
		return;

	/*
	 * This may be the first use of the target tablespace in this database,
	 * so create the per-database subdirectory if needed.
	 *
	 * XXX this is a fairly ugly violation of module layering, but it seems
	 * to be the best place for the check.  Maybe TablespaceCreateDbspace
	 * should live here rather than in commands/tablespace.c?  But that would
	 * mean importing a lot of stuff that smgr.c oughtn't know, either.
	 */
	TablespaceCreateDbspace(reln->smgr_rnode.node.spcNode,
							reln->smgr_rnode.node.dbNode,
							isRedo);

	reln->storageManager->smgr_create(reln, forknum, isRedo);

	if (file_create_hook != NULL)
		file_create_hook(reln->smgr_rnode);
}
/*
 * smgrcreate_ao() -- Create a new AO relation segment.
 *
 * Given a RelFileNode, cause the underlying disk file for the AO segment
 * to be created via mdcreate_ao().
 *
 * With isRedo set, an already existing file is acceptable because we
 * are in a WAL replay sequence.
 *
 * file_create_hook, if installed, is called afterwards.
 */
void
smgrcreate_ao(RelFileNodeBackend rnode, int32 segmentFileNum, bool isRedo)
{
	mdcreate_ao(rnode, segmentFileNum, isRedo);

	if (file_create_hook != NULL)
		file_create_hook(rnode);
}
/*
 * smgrdounlinkall() -- Immediately unlink all forks of all given relations
 *
 * Every fork of every given relation is removed from the store.  Since
 * that cannot be undone, this should not be used during transactional
 * operations.
 *
 * With isRedo set, it is okay for the underlying file(s) to be gone
 * already.
 */
void
smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
{
	int			idx;
	RelFileNodeBackend *targets;
	ForkNumber	fork;

	if (nrels == 0)
		return;

	/*
	 * Collect the identifiers of all relations to be dropped, closing each
	 * relation's forks at the smgr level along the way.
	 */
	targets = palloc(sizeof(RelFileNodeBackend) * nrels);
	for (idx = 0; idx < nrels; idx++)
	{
		SMgrRelation rel = rels[idx];

		targets[idx] = rel->smgr_rnode;

		for (fork = 0; fork <= MAX_FORKNUM; fork++)
			rel->storageManager->smgr_close(rel, fork);
	}

	/*
	 * Discard any buffers still held for these relations.  bufmgr simply
	 * drops them without writing the contents.
	 */
	DropRelFileNodesAllBuffers(targets, nrels);

	/*
	 * It'd be nice to tell the stats collector to forget them immediately,
	 * too.  But we can't because we don't know the OIDs.
	 */

	/*
	 * Send shared-inval messages so that other backends close any dangling
	 * smgr references to these rels.  This is done before the actual
	 * unlinking in case that step fails partway through.  The messages
	 * eventually reach this backend as well, which provides a backstop that
	 * our own smgr rels got closed.
	 */
	for (idx = 0; idx < nrels; idx++)
		CacheInvalidateSmgr(targets[idx]);

	/*
	 * Delete the physical file(s).
	 *
	 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
	 * ERROR, because we've already decided to commit or abort the current
	 * xact.
	 */
	for (idx = 0; idx < nrels; idx++)
	{
		for (fork = 0; fork <= MAX_FORKNUM; fork++)
			rels[idx]->storageManager->smgr_unlink(targets[idx], fork, isRedo);
	}

	/* Tell an installed plugin about every relation that was unlinked */
	if (file_unlink_hook != NULL)
	{
		for (idx = 0; idx < nrels; idx++)
			file_unlink_hook(targets[idx]);
	}

	pfree(targets);
}
/*
 * smgrdounlinkfork() -- Immediately unlink one fork of a relation.
 *
 * The given fork is removed from the store.  Since that cannot be undone,
 * this should not be used during transactional operations.
 *
 * With isRedo set, it is okay for the underlying file to be gone already.
 */
void
smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo)
{
	RelFileNodeBackend target = reln->smgr_rnode;

	/* Close the fork at smgr level */
	reln->storageManager->smgr_close(reln, forknum);

	/*
	 * Discard any buffers still held for the fork.  bufmgr simply drops them
	 * without writing the contents.
	 */
	DropRelFileNodeBuffers(target, forknum, 0);

	/*
	 * It'd be nice to tell the stats collector to forget it immediately,
	 * too.  But we can't because we don't know the OID (and in cases
	 * involving relfilenode swaps, it's not always clear which table OID to
	 * forget, anyway).
	 */

	/*
	 * Send a shared-inval message so that other backends close any dangling
	 * smgr references to this rel.  This is done before the actual unlinking
	 * in case that step fails partway through.  The message eventually
	 * reaches this backend as well, which provides a backstop that our own
	 * smgr rel got closed.
	 */
	CacheInvalidateSmgr(target);

	/*
	 * Delete the physical file(s).
	 *
	 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
	 * ERROR, because we've already decided to commit or abort the current
	 * xact.
	 */
	reln->storageManager->smgr_unlink(target, forknum, isRedo);
}
/*
 * smgrextend() -- Add a new block to a file.
 *
 * The semantics are nearly the same as smgrwrite(): write at the
 * specified position.  This variant, however, is meant for extending a
 * relation (i.e., blocknum is at or beyond the current EOF).  Note that
 * we assume writing a block beyond current EOF causes intervening file
 * space to become filled with zeroes.
 *
 * file_extend_hook, if installed, is called after the storage manager
 * has done the extension.
 */
void
smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		   char *buffer, bool skipFsync)
{
	reln->storageManager->smgr_extend(reln, forknum, blocknum,
									  buffer, skipFsync);

	if (file_extend_hook != NULL)
		file_extend_hook(reln->smgr_rnode);
}
/*
 * smgrprefetch() -- Initiate asynchronous read of the specified block of
 *		a relation.
 *
 * Delegates to the relation's storage manager.
 */
void
smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
	reln->storageManager->smgr_prefetch(reln, forknum, blocknum);
}
/*
 * smgrread() -- read a particular block from a relation into the supplied
 *		buffer.
 *
 * The buffer manager calls this routine in order to instantiate pages in
 * the shared buffer cache.  All storage managers return pages in the
 * format that POSTGRES expects.
 */
void
smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		 char *buffer)
{
	reln->storageManager->smgr_read(reln, forknum, blocknum, buffer);
}
/*
 * smgrwrite() -- Write the supplied buffer out.
 *
 * Use this only to update blocks that already exist in a relation (ie,
 * those before the current EOF).  To extend a relation, use smgrextend().
 *
 * The write is not synchronous: at return the block has only been handed
 * to the kernel and is not necessarily on disk.  However, provisions
 * will be made to fsync the write before the next checkpoint.
 *
 * skipFsync indicates that the caller will make other provisions to
 * fsync the relation, so we needn't bother.  Temporary relations also
 * do not require fsync.
 */
void
smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		  char *buffer, bool skipFsync)
{
	reln->storageManager->smgr_write(reln, forknum, blocknum,
									 buffer, skipFsync);
}
/*
 * smgrwriteback() -- Trigger kernel writeback for the supplied range of
 *		blocks.
 *
 * Delegates to the relation's storage manager.
 */
void
smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
			  BlockNumber nblocks)
{
	reln->storageManager->smgr_writeback(reln, forknum, blocknum, nblocks);
}
/*
 * smgrnblocks() -- Calculate the number of blocks in the
 *		supplied relation.
 *
 * Returns whatever the relation's storage manager reports for the fork.
 */
BlockNumber
smgrnblocks(SMgrRelation reln, ForkNumber forknum)
{
	return reln->storageManager->smgr_nblocks(reln, forknum);
}
/*
 * smgrtruncate() -- Truncate supplied relation to the specified number
 *		of blocks
 *
 * The truncation happens immediately, so it can't be rolled back.
 *
 * file_truncate_hook, if installed, is called after the storage manager
 * has done the truncation.
 */
void
smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	/*
	 * Discard any buffers for the blocks that are about to go away.  bufmgr
	 * simply drops them without writing the contents.
	 */
	DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nblocks);

	/*
	 * Send a shared-inval message so that other backends close any smgr
	 * references they hold for this rel.  They might have open file pointers
	 * to segments that got removed, and/or smgr_targblock variables pointing
	 * past the new rel end.  (The message reaches our own backend too and
	 * causes a probably-unnecessary local smgr flush, but this is not
	 * expected to be a performance-critical path.)  As in the unlink code,
	 * the message must be sent before anything changes on disk.
	 */
	CacheInvalidateSmgr(reln->smgr_rnode);

	/* Now do the truncation itself */
	reln->storageManager->smgr_truncate(reln, forknum, nblocks);

	if (file_truncate_hook != NULL)
		file_truncate_hook(reln->smgr_rnode);
}
/*
 * smgrimmedsync() -- Force the specified relation to stable storage.
 *
 * Synchronously forces all previous writes to the specified relation
 * down to disk.
 *
 * This is useful when building completely new relations (eg, new
 * indexes).  Rather than WAL-logging the index build steps one by one,
 * the completed index pages can simply be written with smgrwrite or
 * smgrextend, and the finished index file fsync'ed before the
 * transaction commits.  (That is sufficient for crash recovery, since
 * it effectively duplicates forcing a checkpoint for the completed
 * index.  It is *not* sufficient if the WAL log is to be used for PITR
 * or replication: in that case WAL entries have to be made as well.)
 *
 * The preceding writes should specify skipFsync = true to avoid
 * duplicative fsyncs.
 *
 * Note that FlushRelationBuffers() must be done first if there is any
 * possibility that dirty buffers exist for the relation; otherwise the
 * sync is not very meaningful.
 */
void
smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
{
	reln->storageManager->smgr_immedsync(reln, forknum);
}
/*
 * AtEOXact_SMgr
 *
 * Called during transaction commit or abort (it doesn't particularly
 * care which).  All transient SMgrRelation objects are closed.
 *
 * This is a compromise: transient SMgrRelations should live for a while
 * (to amortize the costs of blind writes of multiple blocks), yet they
 * must not live forever (we are probably holding open a kernel file
 * descriptor for the underlying file, and that has to get closed
 * reasonably soon if the file gets deleted).
 */
void
AtEOXact_SMgr(void)
{
	dlist_mutable_iter cursor;

	/*
	 * Close every unowned SMgrRelation.  smgrclose() takes each one off the
	 * list, which is why the modification-safe iterator is used.
	 */
	dlist_foreach_modify(cursor, &unowned_relns)
	{
		SMgrRelation unowned = dlist_container(SMgrRelationData, node,
											   cursor.cur);

		Assert(unowned->smgr_owner == NULL);

		smgrclose(unowned);
	}
}
相关信息
相关文章
0
赞
热门推荐
-
2、 - 优质文章
-
3、 gate.io
-
7、 golang
-
9、 openharmony
-
10、 Vue中input框自动聚焦